xref: /titanic_51/usr/src/uts/common/io/ib/clients/rds/rdsib_cm.c (revision 0c19630b1592aa30d3e4d9db1a2a8cf9a91c0e72)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #include <sys/ib/clients/rds/rdsib_cm.h>
76 #include <sys/ib/clients/rds/rdsib_ib.h>
77 #include <sys/ib/clients/rds/rdsib_buf.h>
78 #include <sys/ib/clients/rds/rdsib_ep.h>
79 
80 /*
81  * This file contains CM related work:
82  *
83  * Service registration/deregistration
84  * Path lookup
85  * CM connection callbacks
86  * CM active and passive connection establishment
87  * Connection failover
88  */
89 
/*
 * Shorthands for the IPv4 members of the source and destination addresses
 * in an ibt_ip_cm_info_t, e.g. ipcm_info.SRCIP.
 */
#define	SRCIP	src_addr.un.ip4addr
#define	DSTIP	dst_addr.un.ip4addr
92 
93 /*
94  * Handle an incoming CM REQ
95  */
96 /* ARGSUSED */
97 static ibt_cm_status_t
98 rds_handle_cm_req(rds_state_t *statep, ibt_cm_event_t *evp,
99     ibt_cm_return_args_t *rargsp, void *rcmp, ibt_priv_data_len_t rcmp_len)
100 {
101 	ibt_cm_req_rcv_t	*reqp;
102 	ib_gid_t		lgid, rgid;
103 	rds_cm_private_data_t	cmp;
104 	rds_session_t		*sp;
105 	rds_ep_t		*ep;
106 	ibt_channel_hdl_t	chanhdl;
107 	ibt_ip_cm_info_t	ipcm_info;
108 	uint8_t			save_state, save_type;
109 	int			ret;
110 
111 	RDS_DPRINTF2("rds_handle_cm_req", "Enter");
112 
113 	reqp = &evp->cm_event.req;
114 	rgid = reqp->req_prim_addr.av_dgid; /* requester gid */
115 	lgid = reqp->req_prim_addr.av_sgid; /* receiver gid */
116 
117 	RDS_DPRINTF2(LABEL, "REQ Received: From: %llx:%llx To: %llx:%llx",
118 	    rgid.gid_prefix, rgid.gid_guid, lgid.gid_prefix, lgid.gid_guid);
119 
120 	/*
121 	 * CM private data brings IP information
122 	 * Private data received is a stream of bytes and may not be properly
123 	 * aligned. So, bcopy the data onto the stack before accessing it.
124 	 */
125 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
126 	    sizeof (rds_cm_private_data_t));
127 
128 	/* extract the CM IP info */
129 	ret = ibt_get_ip_data(evp->cm_priv_data_len, evp->cm_priv_data,
130 	    &ipcm_info);
131 	if (ret != IBT_SUCCESS) {
132 		RDS_DPRINTF2("rds_handle_cm_req", "ibt_get_ip_data failed: %d",
133 		    ret);
134 		return (IBT_CM_REJECT);
135 	}
136 
137 	RDS_DPRINTF2("rds_handle_cm_req",
138 	    "REQ Received: From IP: 0x%x To IP: 0x%x type: %d",
139 	    ipcm_info.SRCIP, ipcm_info.DSTIP, cmp.cmp_eptype);
140 
141 	if (cmp.cmp_version != RDS_VERSION) {
142 		RDS_DPRINTF2(LABEL, "Version Mismatch: Local version: %d "
143 		    "Remote version: %d", RDS_VERSION, cmp.cmp_version);
144 		return (IBT_CM_REJECT);
145 	}
146 
147 	/* RDS supports V4 addresses only */
148 	if ((ipcm_info.src_addr.family != AF_INET) ||
149 	    (ipcm_info.dst_addr.family != AF_INET)) {
150 		RDS_DPRINTF2(LABEL, "Unsupported Address Family: "
151 		    "src: %d dst: %d", ipcm_info.src_addr.family,
152 		    ipcm_info.dst_addr.family);
153 		return (IBT_CM_REJECT);
154 	}
155 
156 	if (cmp.cmp_arch != RDS_THIS_ARCH) {
157 		RDS_DPRINTF2(LABEL, "ARCH does not match (%d != %d)",
158 		    cmp.cmp_arch, RDS_THIS_ARCH);
159 		return (IBT_CM_REJECT);
160 	}
161 
162 	if ((cmp.cmp_eptype != RDS_EP_TYPE_CTRL) &&
163 	    (cmp.cmp_eptype != RDS_EP_TYPE_DATA)) {
164 		RDS_DPRINTF2(LABEL, "Unknown Channel type: %d", cmp.cmp_eptype);
165 		return (IBT_CM_REJECT);
166 	}
167 
168 	/* user_buffer_size should be same on all nodes */
169 	if (cmp.cmp_user_buffer_size != UserBufferSize) {
170 		RDS_DPRINTF2(LABEL,
171 		    "UserBufferSize Mismatch, this node: %d remote node: %d",
172 		    UserBufferSize, cmp.cmp_user_buffer_size);
173 		return (IBT_CM_REJECT);
174 	}
175 
176 	/*
177 	 * RDS needs more time to process a failover REQ so send an MRA.
178 	 * Otherwise, the remote may retry the REQ and fail the connection.
179 	 */
180 	if ((cmp.cmp_failover) && (cmp.cmp_eptype == RDS_EP_TYPE_DATA)) {
181 		RDS_DPRINTF2("rds_handle_cm_req", "Session Failover, send MRA");
182 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
183 		    10000000 /* 10 sec */, NULL, 0);
184 	}
185 
186 	/* Is there a session to the destination node? */
187 	rw_enter(&statep->rds_sessionlock, RW_READER);
188 	sp = rds_session_lkup(statep, ipcm_info.SRCIP, rgid.gid_guid);
189 	rw_exit(&statep->rds_sessionlock);
190 
191 	if (sp == NULL) {
192 		/*
193 		 * currently there is no session to the destination
194 		 * remote ip in the private data is the local ip and vice
195 		 * versa
196 		 */
197 		sp = rds_session_create(statep, ipcm_info.DSTIP,
198 		    ipcm_info.SRCIP, reqp, RDS_SESSION_PASSIVE);
199 		if (sp == NULL) {
200 			/* Check the list anyway. */
201 			rw_enter(&statep->rds_sessionlock, RW_READER);
202 			sp = rds_session_lkup(statep, ipcm_info.SRCIP,
203 			    rgid.gid_guid);
204 			rw_exit(&statep->rds_sessionlock);
205 			if (sp == NULL) {
206 				/*
207 				 * The only way this can fail is due to lack
208 				 * of kernel resources
209 				 */
210 				return (IBT_CM_REJECT);
211 			}
212 		}
213 	}
214 
215 	rw_enter(&sp->session_lock, RW_WRITER);
216 
217 	/* catch peer-to-peer case as soon as possible */
218 	if ((sp->session_state == RDS_SESSION_STATE_CREATED) ||
219 	    (sp->session_state == RDS_SESSION_STATE_INIT)) {
220 		/* Check possible peer-to-peer case here */
221 		if (sp->session_type != RDS_SESSION_PASSIVE) {
222 			RDS_DPRINTF2("rds_handle_cm_req",
223 			    "SP(%p) Peer-peer connection handling", sp);
224 			if (lgid.gid_guid > rgid.gid_guid) {
225 				/* this node is active so reject this request */
226 				rw_exit(&sp->session_lock);
227 				return (IBT_CM_REJECT);
228 			} else {
229 				/* this node is passive, change the session */
230 				sp->session_type = RDS_SESSION_PASSIVE;
231 				sp->session_lgid = lgid;
232 				sp->session_rgid = rgid;
233 			}
234 		}
235 	}
236 
237 	RDS_DPRINTF2(LABEL, "SP(%p) state: %d", sp, sp->session_state);
238 	save_state = sp->session_state;
239 	save_type = sp->session_type;
240 
241 	switch (sp->session_state) {
242 	case RDS_SESSION_STATE_CONNECTED:
243 		RDS_DPRINTF2(LABEL, "STALE Session Detected SP(%p)", sp);
244 		sp->session_state = RDS_SESSION_STATE_ERROR;
245 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
246 		    "RDS_SESSION_STATE_ERROR", sp);
247 
248 		/* FALLTHRU */
249 	case RDS_SESSION_STATE_ERROR:
250 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
251 		/*
252 		 * Some other thread must be processing this session,
253 		 * this thread must wait until the other thread finishes.
254 		 */
255 		sp->session_type = RDS_SESSION_PASSIVE;
256 		rw_exit(&sp->session_lock);
257 
258 		/* Handling this will take some time, so send an MRA */
259 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, evp->cm_session_id,
260 		    10000000 /* 10 sec */, NULL, 0);
261 
262 		/*
263 		 * Any pending completions don't get flushed until the channel
264 		 * is closed. So, passing 0 here will not wait for pending
265 		 * completions in rds_session_close before closing the channel
266 		 */
267 		rds_session_close(sp, IBT_NOCALLBACKS, 0);
268 
269 		rw_enter(&sp->session_lock, RW_WRITER);
270 
271 		/*
272 		 * If the session was in ERROR, then either a failover thread
273 		 * or event_failure thread would be processing this session.
274 		 * This thread should wait for event_failure thread to
275 		 * complete. This need not wait for failover thread.
276 		 */
277 		if ((save_state != RDS_SESSION_STATE_CONNECTED) &&
278 		    (save_type == RDS_SESSION_PASSIVE)) {
279 				/*
280 				 * The other thread is event_failure thread,
281 				 * wait until it finishes.
282 				 */
283 				while (!((sp->session_state ==
284 				    RDS_SESSION_STATE_FAILED) ||
285 				    (sp->session_state ==
286 				    RDS_SESSION_STATE_FINI))) {
287 					rw_exit(&sp->session_lock);
288 					delay(drv_usectohz(1000000));
289 					rw_enter(&sp->session_lock, RW_WRITER);
290 				}
291 		}
292 
293 		/* move the session to init state */
294 		if ((sp->session_state == RDS_SESSION_STATE_ERROR) ||
295 		    (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING)) {
296 			ret = rds_session_reinit(sp, lgid);
297 			sp->session_myip = ipcm_info.DSTIP;
298 			sp->session_lgid = lgid;
299 			sp->session_rgid = rgid;
300 			if (ret != 0) {
301 				rds_session_fini(sp);
302 				sp->session_state = RDS_SESSION_STATE_FAILED;
303 				RDS_DPRINTF3("rds_handle_cm_req",
304 				    "SP(%p) State RDS_SESSION_STATE_FAILED",
305 				    sp);
306 				rw_exit(&sp->session_lock);
307 				return (IBT_CM_REJECT);
308 			} else {
309 				sp->session_state = RDS_SESSION_STATE_INIT;
310 				RDS_DPRINTF3("rds_handle_cm_req",
311 				    "SP(%p) State RDS_SESSION_STATE_INIT", sp);
312 			}
313 
314 			if (cmp.cmp_eptype == RDS_EP_TYPE_CTRL) {
315 				ep = &sp->session_ctrlep;
316 			} else {
317 				ep = &sp->session_dataep;
318 			}
319 			break;
320 		}
321 
322 		/* FALLTHRU */
323 	case RDS_SESSION_STATE_CREATED:
324 	case RDS_SESSION_STATE_FAILED:
325 	case RDS_SESSION_STATE_FINI:
326 		/*
327 		 * Initialize both channels, we accept this connection
328 		 * only if both channels are initialized
329 		 */
330 		sp->session_type = RDS_SESSION_PASSIVE;
331 		sp->session_lgid = lgid;
332 		sp->session_rgid = rgid;
333 		sp->session_state = RDS_SESSION_STATE_CREATED;
334 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
335 		    "RDS_SESSION_STATE_CREATED", sp);
336 		ret = rds_session_init(sp);
337 		if (ret != 0) {
338 			/* Seems like there are not enough resources */
339 			sp->session_state = RDS_SESSION_STATE_FAILED;
340 			RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
341 			    "RDS_SESSION_STATE_FAILED", sp);
342 			rw_exit(&sp->session_lock);
343 			return (IBT_CM_REJECT);
344 		}
345 		sp->session_state = RDS_SESSION_STATE_INIT;
346 		RDS_DPRINTF3("rds_handle_cm_req", "SP(%p) State "
347 		    "RDS_SESSION_STATE_INIT", sp);
348 
349 		/* FALLTHRU */
350 	case RDS_SESSION_STATE_INIT:
351 		/*
352 		 * When re-using an existing session, make sure the
353 		 * session is still through the same HCA. Otherwise, the
354 		 * memory registrations have to moved to the new HCA.
355 		 */
356 		if (cmp.cmp_eptype == RDS_EP_TYPE_DATA) {
357 			if (sp->session_lgid.gid_guid != lgid.gid_guid) {
358 				RDS_DPRINTF2("rds_handle_cm_req",
359 				    "Existing Session but different gid "
360 				    "existing: 0x%llx, new: 0x%llx, "
361 				    "sending an MRA",
362 				    sp->session_lgid.gid_guid, lgid.gid_guid);
363 				(void) ibt_cm_delay(IBT_CM_DELAY_REQ,
364 				    evp->cm_session_id, 10000000 /* 10 sec */,
365 				    NULL, 0);
366 				ret = rds_session_reinit(sp, lgid);
367 				if (ret != 0) {
368 					rds_session_fini(sp);
369 					sp->session_state =
370 					    RDS_SESSION_STATE_FAILED;
371 					sp->session_failover = 0;
372 					RDS_DPRINTF3("rds_failover_session",
373 					    "SP(%p) State "
374 					    "RDS_SESSION_STATE_FAILED", sp);
375 					rw_exit(&sp->session_lock);
376 					return (IBT_CM_REJECT);
377 				}
378 			}
379 			ep = &sp->session_dataep;
380 		} else {
381 			ep = &sp->session_ctrlep;
382 		}
383 
384 		break;
385 	default:
386 		RDS_DPRINTF2(LABEL, "ERROR: SP(%p) is in an unexpected "
387 		    "state: %d", sp, sp->session_state);
388 		rw_exit(&sp->session_lock);
389 		return (IBT_CM_REJECT);
390 	}
391 
392 	sp->session_failover = 0; /* reset any previous value */
393 	if (cmp.cmp_failover) {
394 		RDS_DPRINTF2("rds_handle_cm_req",
395 		    "SP(%p) Failover Session (BP %p)", sp, cmp.cmp_last_bufid);
396 		sp->session_failover = 1;
397 	}
398 
399 	mutex_enter(&ep->ep_lock);
400 	if (ep->ep_state == RDS_EP_STATE_UNCONNECTED) {
401 		ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
402 		sp->session_type = RDS_SESSION_PASSIVE;
403 		rw_exit(&sp->session_lock);
404 	} else if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
405 		rw_exit(&sp->session_lock);
406 		/*
407 		 * Peer to peer connection. There is an active
408 		 * connection pending on this ep. The one with
409 		 * greater port guid becomes active and the
410 		 * other becomes passive.
411 		 */
412 		RDS_DPRINTF2("rds_handle_cm_req",
413 		    "EP(%p) Peer-peer connection handling", ep);
414 		if (lgid.gid_guid > rgid.gid_guid) {
415 			/* this node is active so reject this request */
416 			mutex_exit(&ep->ep_lock);
417 			RDS_DPRINTF2(LABEL, "SP(%p) EP(%p): "
418 			    "Rejecting passive in favor of active", sp, ep);
419 			return (IBT_CM_REJECT);
420 		} else {
421 			/*
422 			 * This session is not the active end, change it
423 			 * to passive end.
424 			 */
425 			ep->ep_state = RDS_EP_STATE_PASSIVE_PENDING;
426 
427 			rw_enter(&sp->session_lock, RW_WRITER);
428 			sp->session_type = RDS_SESSION_PASSIVE;
429 			sp->session_lgid = lgid;
430 			sp->session_rgid = rgid;
431 			rw_exit(&sp->session_lock);
432 		}
433 	} else {
434 		rw_exit(&sp->session_lock);
435 	}
436 
437 	ep->ep_lbufid = cmp.cmp_last_bufid;
438 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
439 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
440 	cmp.cmp_last_bufid = ep->ep_rbufid;
441 	cmp.cmp_ack_addr = ep->ep_ack_addr;
442 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
443 	mutex_exit(&ep->ep_lock);
444 
445 	/* continue with accepting the connection request for this channel */
446 	chanhdl = rds_ep_alloc_rc_channel(ep, reqp->req_prim_hca_port);
447 	if (chanhdl == NULL) {
448 		mutex_enter(&ep->ep_lock);
449 		ep->ep_state = RDS_EP_STATE_UNCONNECTED;
450 		mutex_exit(&ep->ep_lock);
451 		return (IBT_CM_REJECT);
452 	}
453 
454 	/* pre-post recv buffers in the RQ */
455 	rds_post_recv_buf((void *)chanhdl);
456 
457 	rargsp->cm_ret_len = sizeof (rds_cm_private_data_t);
458 	bcopy((uint8_t *)&cmp, rcmp, sizeof (rds_cm_private_data_t));
459 	rargsp->cm_ret.rep.cm_channel = chanhdl;
460 	rargsp->cm_ret.rep.cm_rdma_ra_out = 4;
461 	rargsp->cm_ret.rep.cm_rdma_ra_in = 4;
462 	rargsp->cm_ret.rep.cm_rnr_retry_cnt = MinRnrRetry;
463 
464 	RDS_DPRINTF2("rds_handle_cm_req", "Return: SP(%p) EP(%p) Chan (%p)",
465 	    sp, ep, chanhdl);
466 
467 	return (IBT_CM_ACCEPT);
468 }
469 
470 /*
471  * Handle an incoming CM REP
472  * Pre-post recv buffers for the QP
473  */
474 /* ARGSUSED */
475 static ibt_cm_status_t
476 rds_handle_cm_rep(ibt_cm_event_t *evp, ibt_cm_return_args_t *rargsp,
477     void *rcmp, ibt_priv_data_len_t rcmp_len)
478 {
479 	rds_ep_t	*ep;
480 	rds_cm_private_data_t	cmp;
481 
482 	RDS_DPRINTF2("rds_handle_cm_rep", "Enter");
483 
484 	/* pre-post recv buffers in the RQ */
485 	rds_post_recv_buf((void *)evp->cm_channel);
486 
487 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
488 	bcopy((uint8_t *)evp->cm_priv_data, &cmp,
489 	    sizeof (rds_cm_private_data_t));
490 	ep->ep_lbufid = cmp.cmp_last_bufid;
491 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_raddr = (ib_vaddr_t)cmp.cmp_ack_addr;
492 	ep->ep_ackwr.wr.rc.rcwr.rdma.rdma_rkey = cmp.cmp_ack_rkey;
493 
494 	rargsp->cm_ret_len = 0;
495 
496 	RDS_DPRINTF2("rds_handle_cm_rep", "Return: lbufid: %p", ep->ep_lbufid);
497 
498 	return (IBT_CM_ACCEPT);
499 }
500 
501 /*
502  * Handle CONN EST
503  */
504 static ibt_cm_status_t
505 rds_handle_cm_conn_est(ibt_cm_event_t *evp)
506 {
507 	rds_session_t	*sp;
508 	rds_ep_t	*ep;
509 
510 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
511 
512 	RDS_DPRINTF2("rds_handle_cm_conn_est", "EP(%p) State: %d", ep,
513 	    ep->ep_state);
514 
515 	mutex_enter(&ep->ep_lock);
516 	ASSERT((ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) ||
517 	    (ep->ep_state == RDS_EP_STATE_PASSIVE_PENDING));
518 	ep->ep_state = RDS_EP_STATE_CONNECTED;
519 	ep->ep_chanhdl = evp->cm_channel;
520 	sp = ep->ep_sp;
521 	mutex_exit(&ep->ep_lock);
522 
523 	(void) rds_session_active(sp);
524 
525 	RDS_DPRINTF2("rds_handle_cm_conn_est", "Return");
526 	return (IBT_CM_ACCEPT);
527 }
528 
529 /*
530  * Handle CONN CLOSED
531  */
532 static ibt_cm_status_t
533 rds_handle_cm_conn_closed(ibt_cm_event_t *evp)
534 {
535 	rds_ep_t	*ep;
536 	rds_session_t	*sp;
537 
538 	/* Catch DREQs but ignore DREPs */
539 	if (evp->cm_event.closed != IBT_CM_CLOSED_DREQ_RCVD) {
540 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
541 		    "Ignoring Event: %d received", evp->cm_event.closed);
542 		return (IBT_CM_ACCEPT);
543 	}
544 
545 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
546 	sp = ep->ep_sp;
547 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "EP(%p) Chan(%p) Enter",
548 	    ep, evp->cm_channel);
549 
550 	mutex_enter(&ep->ep_lock);
551 	if (ep->ep_state != RDS_EP_STATE_CONNECTED) {
552 		/* Ignore this DREQ */
553 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
554 		    "EP(%p) not connected, state: %d", ep, ep->ep_state);
555 		mutex_exit(&ep->ep_lock);
556 		return (IBT_CM_ACCEPT);
557 	}
558 	ep->ep_state = RDS_EP_STATE_CLOSING;
559 	mutex_exit(&ep->ep_lock);
560 
561 	rw_enter(&sp->session_lock, RW_WRITER);
562 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) - state: %d", sp,
563 	    sp->session_state);
564 
565 	switch (sp->session_state) {
566 	case RDS_SESSION_STATE_CONNECTED:
567 	case RDS_SESSION_STATE_HCA_CLOSING:
568 		sp->session_state = RDS_SESSION_STATE_PASSIVE_CLOSING;
569 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
570 		    "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
571 		break;
572 
573 	case RDS_SESSION_STATE_PASSIVE_CLOSING:
574 		sp->session_state = RDS_SESSION_STATE_CLOSED;
575 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
576 		    "RDS_SESSION_STATE_CLOSED", sp);
577 		rds_passive_session_fini(sp);
578 		sp->session_state = RDS_SESSION_STATE_FINI;
579 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
580 		    "SP(%p) State RDS_SESSION_STATE_FINI", sp);
581 		break;
582 
583 	case RDS_SESSION_STATE_ACTIVE_CLOSING:
584 	case RDS_SESSION_STATE_ERROR:
585 	case RDS_SESSION_STATE_CLOSED:
586 		break;
587 
588 	case RDS_SESSION_STATE_INIT:
589 		sp->session_state = RDS_SESSION_STATE_ERROR;
590 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
591 		    "RDS_SESSION_STATE_ERROR", sp);
592 		rds_passive_session_fini(sp);
593 		sp->session_state = RDS_SESSION_STATE_FAILED;
594 		RDS_DPRINTF3("rds_handle_cm_conn_closed",
595 		    "SP(%p) State RDS_SESSION_STATE_FAILED", sp);
596 		break;
597 
598 	default:
599 		RDS_DPRINTF2("rds_handle_cm_conn_closed",
600 		    "SP(%p) - Unexpected state: %d", sp, sp->session_state);
601 		rds_passive_session_fini(sp);
602 		sp->session_state = RDS_SESSION_STATE_FAILED;
603 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
604 		    "RDS_SESSION_STATE_FAILED", sp);
605 	}
606 	rw_exit(&sp->session_lock);
607 
608 	mutex_enter(&ep->ep_lock);
609 	ep->ep_state = RDS_EP_STATE_CLOSED;
610 	mutex_exit(&ep->ep_lock);
611 
612 	RDS_DPRINTF2("rds_handle_cm_conn_closed", "SP(%p) Return", sp);
613 	return (IBT_CM_ACCEPT);
614 }
615 
616 /*
617  * Handle EVENT FAILURE
618  */
619 static ibt_cm_status_t
620 rds_handle_cm_event_failure(ibt_cm_event_t *evp)
621 {
622 	rds_ep_t	*ep;
623 	rds_session_t	*sp;
624 	int		ret;
625 
626 	RDS_DPRINTF2("rds_handle_cm_event_failure", "Enter: Chan hdl: 0x%p "
627 	    "Code: %d msg: %d reason: %d", evp->cm_channel,
628 	    evp->cm_event.failed.cf_code, evp->cm_event.failed.cf_msg,
629 	    evp->cm_event.failed.cf_reason);
630 
631 	if (evp->cm_event.failed.cf_reason == IBT_CM_INVALID_SID) {
632 		RDS_DPRINTF2(LABEL,
633 		    "Received REJ with reason IBT_CM_INVALID_SID: "
634 		    "RDS may not be loaded on the remote system");
635 	}
636 
637 	if (evp->cm_channel == NULL) {
638 		return (IBT_CM_ACCEPT);
639 	}
640 
641 	if ((evp->cm_event.failed.cf_code != IBT_CM_FAILURE_STALE) &&
642 	    (evp->cm_event.failed.cf_msg == IBT_CM_FAILURE_REQ)) {
643 		/*
644 		 * This end is active, just ignore, ibt_open_rc_channel()
645 		 * caller will take care of cleanup.
646 		 */
647 		RDS_DPRINTF2("rds_handle_cm_event_failure",
648 		    "Ignoring this event: Chan hdl: 0x%p", evp->cm_channel);
649 		return (IBT_CM_ACCEPT);
650 	}
651 
652 	ep = (rds_ep_t *)ibt_get_chan_private(evp->cm_channel);
653 	sp = ep->ep_sp;
654 
655 	rw_enter(&sp->session_lock, RW_WRITER);
656 	if (sp->session_type == RDS_SESSION_PASSIVE) {
657 		RDS_DPRINTF2("rds_handle_cm_event_failure",
658 		    "SP(%p) - state: %d", sp, sp->session_state);
659 		if ((sp->session_state == RDS_SESSION_STATE_INIT) ||
660 		    (sp->session_state == RDS_SESSION_STATE_CONNECTED)) {
661 			sp->session_state = RDS_SESSION_STATE_ERROR;
662 			RDS_DPRINTF3("rds_handle_cm_event_failure",
663 			    "SP(%p) State RDS_SESSION_STATE_ERROR", sp);
664 
665 			/*
666 			 * Store the cm_channel for freeing later
667 			 * Active side frees it on ibt_open_rc_channel
668 			 * failure
669 			 */
670 			if (ep->ep_chanhdl == NULL) {
671 				ep->ep_chanhdl = evp->cm_channel;
672 			}
673 			rw_exit(&sp->session_lock);
674 
675 			/*
676 			 * rds_passive_session_fini should not be called
677 			 * directly in the CM handler. It will cause a deadlock.
678 			 */
679 			ret = ddi_taskq_dispatch(rds_taskq,
680 			    rds_cleanup_passive_session, (void *)sp,
681 			    DDI_NOSLEEP);
682 			if (ret != DDI_SUCCESS) {
683 				RDS_DPRINTF2("rds_handle_cm_event_failure",
684 				    "SP(%p) TaskQ dispatch FAILED:%d", sp, ret);
685 			}
686 			return (IBT_CM_ACCEPT);
687 		}
688 	}
689 	rw_exit(&sp->session_lock);
690 
691 	RDS_DPRINTF2("rds_handle_cm_event_failure", "SP(%p) Return", sp);
692 	return (IBT_CM_ACCEPT);
693 }
694 
695 /*
696  * CM Handler
697  *
698  * Called by IBCM
699  * The cm_private type differs for active and passive events.
700  */
701 ibt_cm_status_t
702 rds_cm_handler(void *cm_private, ibt_cm_event_t *eventp,
703     ibt_cm_return_args_t *ret_args, void *ret_priv_data,
704     ibt_priv_data_len_t ret_len_max)
705 {
706 	ibt_cm_status_t		ret = IBT_CM_ACCEPT;
707 
708 	RDS_DPRINTF2("rds_cm_handler", "Enter: event: %d", eventp->cm_type);
709 
710 	switch (eventp->cm_type) {
711 	case IBT_CM_EVENT_REQ_RCV:
712 		ret = rds_handle_cm_req((rds_state_t *)cm_private, eventp,
713 		    ret_args, ret_priv_data, ret_len_max);
714 		break;
715 	case IBT_CM_EVENT_REP_RCV:
716 		ret = rds_handle_cm_rep(eventp, ret_args, ret_priv_data,
717 		    ret_len_max);
718 		break;
719 	case IBT_CM_EVENT_MRA_RCV:
720 		/* Not supported */
721 		break;
722 	case IBT_CM_EVENT_CONN_EST:
723 		ret = rds_handle_cm_conn_est(eventp);
724 		break;
725 	case IBT_CM_EVENT_CONN_CLOSED:
726 		ret = rds_handle_cm_conn_closed(eventp);
727 		break;
728 	case IBT_CM_EVENT_FAILURE:
729 		ret = rds_handle_cm_event_failure(eventp);
730 		break;
731 	case IBT_CM_EVENT_LAP_RCV:
732 		/* Not supported */
733 		RDS_DPRINTF2(LABEL, "LAP message received");
734 		break;
735 	case IBT_CM_EVENT_APR_RCV:
736 		/* Not supported */
737 		RDS_DPRINTF2(LABEL, "APR message received");
738 		break;
739 	default:
740 		break;
741 	}
742 
743 	RDS_DPRINTF2("rds_cm_handler", "Return");
744 
745 	return (ret);
746 }
747 
/*
 * Port number used to derive the RDS service id and sent as the source
 * port in the CM IP private data. This is based on OFED Linux RDS.
 */
#define	RDS_PORT_NUM	6556
750 
751 /*
752  * Register the wellknown service with service id: RDS_SERVICE_ID
753  * Incoming connection requests should arrive on this service id.
754  */
755 ibt_srv_hdl_t
756 rds_register_service(ibt_clnt_hdl_t rds_ibhdl)
757 {
758 	ibt_srv_hdl_t	srvhdl;
759 	ibt_srv_desc_t	srvdesc;
760 	int		ret;
761 
762 	RDS_DPRINTF2("rds_register_service", "Enter: 0x%p", rds_ibhdl);
763 
764 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
765 	srvdesc.sd_handler = rds_cm_handler;
766 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
767 
768 	/*
769 	 * This is the new service id as per:
770 	 * Annex A11: RDMA IP CM Service
771 	 */
772 	rdsib_statep->rds_service_id = ibt_get_ip_sid(IPPROTO_TCP,
773 	    RDS_PORT_NUM);
774 	ret = ibt_register_service(rds_ibhdl, &srvdesc,
775 	    rdsib_statep->rds_service_id, 1, &srvhdl, NULL);
776 	if (ret != IBT_SUCCESS) {
777 		RDS_DPRINTF2(LABEL,
778 		    "RDS Service (0x%llx) Registration Failed: %d",
779 		    rdsib_statep->rds_service_id, ret);
780 		return (NULL);
781 	}
782 
783 	RDS_DPRINTF2("rds_register_service", "Return: 0x%p", srvhdl);
784 	return (srvhdl);
785 }
786 
787 /* Bind the RDS service on all ports */
788 int
789 rds_bind_service(rds_state_t *statep)
790 {
791 	rds_hca_t	*hcap;
792 	ib_gid_t	gid;
793 	uint_t		jx, nbinds = 0, nports = 0;
794 	int		ret;
795 
796 	RDS_DPRINTF2("rds_bind_service", "Enter: 0x%p", statep);
797 
798 	rw_enter(&statep->rds_hca_lock, RW_READER);
799 
800 	hcap = statep->rds_hcalistp;
801 	while (hcap != NULL) {
802 
803 		/* skip the HCAs that are not fully online */
804 		if ((hcap->hca_state != RDS_HCA_STATE_OPEN) &&
805 		    (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED)) {
806 			RDS_DPRINTF2("rds_bind_service",
807 			    "Skipping HCA: 0x%llx, state: %d",
808 			    hcap->hca_guid, hcap->hca_state);
809 			hcap = hcap->hca_nextp;
810 			continue;
811 		}
812 
813 		/* currently, we have space for only 4 bindhdls */
814 		ASSERT(hcap->hca_nports < 4);
815 		for (jx = 0; jx < hcap->hca_nports; jx++) {
816 			nports++;
817 			if (hcap->hca_pinfop[jx].p_linkstate !=
818 			    IBT_PORT_ACTIVE) {
819 				/*
820 				 * service bind will be called in the async
821 				 * handler when the port comes up. Clear any
822 				 * stale bind handle.
823 				 */
824 				hcap->hca_bindhdl[jx] = NULL;
825 				continue;
826 			}
827 
828 			gid = hcap->hca_pinfop[jx].p_sgid_tbl[0];
829 			RDS_DPRINTF5(LABEL, "HCA: 0x%llx Port: %d "
830 			    "gid: %llx:%llx", hcap->hca_guid,
831 			    hcap->hca_pinfop[jx].p_port_num, gid.gid_prefix,
832 			    gid.gid_guid);
833 
834 			/* pass statep as cm_private */
835 			ret = ibt_bind_service(statep->rds_srvhdl, gid,
836 			    NULL, statep, &hcap->hca_bindhdl[jx]);
837 			if (ret != IBT_SUCCESS) {
838 				RDS_DPRINTF2(LABEL, "Bind service for "
839 				    "HCA: 0x%llx Port: %d gid %llx:%llx "
840 				    "failed: %d", hcap->hca_guid,
841 				    hcap->hca_pinfop[jx].p_port_num,
842 				    gid.gid_prefix, gid.gid_guid, ret);
843 				continue;
844 			}
845 
846 			nbinds++;
847 		}
848 		hcap = hcap->hca_nextp;
849 	}
850 
851 	rw_exit(&statep->rds_hca_lock);
852 
853 	RDS_DPRINTF2(LABEL, "RDS Service available on %d/%d ports",
854 	    nbinds, nports);
855 
856 #if 0
857 	if (nbinds == 0) {
858 		return (-1);
859 	}
860 #endif
861 
862 	RDS_DPRINTF2("rds_bind_service", "Return");
863 
864 	return (0);
865 }
866 
867 /* Open an RC connection */
868 int
869 rds_open_rc_channel(rds_ep_t *ep, ibt_path_info_t *pinfo,
870     ibt_execution_mode_t mode, ibt_channel_hdl_t *chanhdl)
871 {
872 	rds_session_t		*sp;
873 	ibt_chan_open_args_t	ocargs;
874 	ibt_rc_returns_t	ocrets;
875 	rds_cm_private_data_t	cmp;
876 	uint8_t			hca_port;
877 	ibt_channel_hdl_t	hdl;
878 	ibt_status_t		ret = 0;
879 	ibt_ip_cm_info_t	ipcm_info;
880 
881 	RDS_DPRINTF2("rds_open_rc_channel", "Enter: EP(%p) mode: %d", ep, mode);
882 
883 	sp = ep->ep_sp;
884 
885 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
886 	ipcm_info.src_addr.family = AF_INET;
887 	ipcm_info.SRCIP = sp->session_myip;
888 	ipcm_info.dst_addr.family = AF_INET;
889 	ipcm_info.DSTIP = sp->session_remip;
890 	ipcm_info.src_port = RDS_PORT_NUM;
891 	ret = ibt_format_ip_private_data(&ipcm_info,
892 	    sizeof (rds_cm_private_data_t), &cmp);
893 	if (ret != IBT_SUCCESS) {
894 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_format_ip_private_data "
895 		    "failed: %d", sp, ep, ret);
896 		return (-1);
897 	}
898 
899 	hca_port = pinfo->pi_prim_cep_path.cep_hca_port_num;
900 
901 	hdl = rds_ep_alloc_rc_channel(ep, hca_port);
902 	if (hdl == NULL) {
903 		return (-1);
904 	}
905 
906 	cmp.cmp_version = RDS_VERSION;
907 	cmp.cmp_arch = RDS_THIS_ARCH;
908 	cmp.cmp_eptype = ep->ep_type;
909 	cmp.cmp_failover = sp->session_failover;
910 	cmp.cmp_last_bufid = ep->ep_rbufid;
911 	cmp.cmp_user_buffer_size = UserBufferSize;
912 	cmp.cmp_ack_addr = ep->ep_ack_addr;
913 	cmp.cmp_ack_rkey = ep->ep_ack_rkey;
914 
915 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
916 	bzero(&ocrets, sizeof (ibt_rc_returns_t));
917 	ocargs.oc_path = pinfo;
918 	ocargs.oc_cm_handler = rds_cm_handler;
919 	ocargs.oc_cm_clnt_private = NULL;
920 	ocargs.oc_rdma_ra_out = 4;
921 	ocargs.oc_rdma_ra_in = 4;
922 	ocargs.oc_priv_data_len = sizeof (rds_cm_private_data_t);
923 	ocargs.oc_priv_data = &cmp;
924 	ocargs.oc_path_retry_cnt = IBPathRetryCount;
925 	ocargs.oc_path_rnr_retry_cnt = MinRnrRetry;
926 	ret = ibt_open_rc_channel(hdl, IBT_OCHAN_NO_FLAGS,
927 	    mode, &ocargs, &ocrets);
928 	if (ret != IBT_SUCCESS) {
929 		RDS_DPRINTF2(LABEL, "SP(%p) EP(%p) ibt_open_rc_channel "
930 		    "failed: %d", sp, ep, ret);
931 		(void) ibt_flush_channel(hdl);
932 		(void) ibt_free_channel(hdl);
933 
934 		mutex_enter(&ep->ep_lock);
935 		/* don't cleanup if this failure is due to peer-peer race */
936 		if (ep->ep_state == RDS_EP_STATE_ACTIVE_PENDING) {
937 			/* cleanup stuff allocated in rds_ep_alloc_rc_channel */
938 			ep->ep_state = RDS_EP_STATE_ERROR;
939 			rds_ep_free_rc_channel(ep);
940 		}
941 		mutex_exit(&ep->ep_lock);
942 
943 		return (-1);
944 	}
945 
946 	*chanhdl = hdl;
947 
948 	RDS_DPRINTF2("rds_open_rc_channel", "Return: EP(%p) Chan: %p", ep,
949 	    *chanhdl);
950 
951 	return (0);
952 }
953 
954 int
955 rds_close_rc_channel(ibt_channel_hdl_t chanhdl, ibt_execution_mode_t mode)
956 {
957 	int	ret;
958 
959 	RDS_DPRINTF2("rds_close_rc_channel", "Enter: Chan(%p) Mode(%d)",
960 	    chanhdl, mode);
961 
962 	ret = ibt_close_rc_channel(chanhdl, mode, NULL, 0, NULL, NULL, 0);
963 
964 	RDS_DPRINTF2("rds_close_rc_channel", "Return Chan(%p)", chanhdl);
965 
966 	return (ret);
967 }
968