xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_ib.c (revision bea83d026ee1bd1b2a2419e1d0232f107a5d7d9b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #pragma ident	"%Z%%M%	%I%	%E% SMI"
76 
77 #include <sys/types.h>
78 #include <sys/ddi.h>
79 #include <sys/sunddi.h>
80 #include <sys/ib/clients/rds/rdsib_cm.h>
81 #include <sys/ib/clients/rds/rdsib_ib.h>
82 #include <sys/ib/clients/rds/rdsib_buf.h>
83 #include <sys/ib/clients/rds/rdsib_ep.h>
84 #include <sys/ib/clients/rds/rds_kstat.h>
85 
/* IBTF asynchronous event handler, registered via ibt_attach() below */
static void rds_async_handler(void *clntp, ibt_hca_hdl_t hdl,
    ibt_async_code_t code, ibt_async_event_t *event);

/*
 * IBTF client registration information for RDS.  Passed to ibt_attach()
 * in rdsib_open_ib() to register this driver with the IB Transport
 * Framework.
 */
static struct ibt_clnt_modinfo_s rds_ib_modinfo = {
	IBTI_V2,		/* IBTI interface version */
	IBT_NETWORK,		/* client class */
	rds_async_handler,	/* async event callback */
	NULL,
	"RDS"			/* client name */
};
96 
/* performance tunables */
uint_t		rds_no_interrupts = 0;	/* non-zero: limit send CQ reaping per call (see rds_poll_send_completions) */
uint_t		rds_poll_percent_full = 25;	/* presumably a CQ poll threshold; not referenced in this file — verify against callers */
uint_t		rds_wc_signal = IBT_NEXT_SOLICITED;	/* recv CQ re-arm mode used in rds_recvcq_handler */
uint_t		rds_waittime_ms = 100; /* ms */

extern dev_info_t *rdsib_dev_info;
extern void rds_close_sessions();
105 
106 static void
107 rdsib_validate_chan_sizes(ibt_hca_attr_t *hattrp)
108 {
109 	/* The SQ size should not be more than that supported by the HCA */
110 	if (((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_chan_sz) ||
111 	    ((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_cq_sz)) {
112 		RDS_DPRINTF0("RDSIB", "MaxDataSendBuffers + %d is greater "
113 		    "than that supported by the HCA driver "
114 		    "(%d + %d > %d or %d), lowering it to a supported value.",
115 		    RDS_NUM_ACKS, MaxDataSendBuffers, RDS_NUM_ACKS,
116 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
117 
118 		MaxDataSendBuffers = (hattrp->hca_max_chan_sz >
119 		    hattrp->hca_max_cq_sz) ?
120 		    hattrp->hca_max_cq_sz - RDS_NUM_ACKS :
121 		    hattrp->hca_max_chan_sz - RDS_NUM_ACKS;
122 	}
123 
124 	/* The RQ size should not be more than that supported by the HCA */
125 	if ((MaxDataRecvBuffers > hattrp->hca_max_chan_sz) ||
126 	    (MaxDataRecvBuffers > hattrp->hca_max_cq_sz)) {
127 		RDS_DPRINTF0("RDSIB", "MaxDataRecvBuffers is greater than that "
128 		    "supported by the HCA driver (%d > %d or %d), lowering it "
129 		    "to a supported value.", MaxDataRecvBuffers,
130 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
131 
132 		MaxDataRecvBuffers = (hattrp->hca_max_chan_sz >
133 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
134 		    hattrp->hca_max_chan_sz;
135 	}
136 
137 	/* The SQ size should not be more than that supported by the HCA */
138 	if ((MaxCtrlSendBuffers > hattrp->hca_max_chan_sz) ||
139 	    (MaxCtrlSendBuffers > hattrp->hca_max_cq_sz)) {
140 		RDS_DPRINTF0("RDSIB", "MaxCtrlSendBuffers is greater than that "
141 		    "supported by the HCA driver (%d > %d or %d), lowering it "
142 		    "to a supported value.", MaxCtrlSendBuffers,
143 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
144 
145 		MaxCtrlSendBuffers = (hattrp->hca_max_chan_sz >
146 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
147 		    hattrp->hca_max_chan_sz;
148 	}
149 
150 	/* The RQ size should not be more than that supported by the HCA */
151 	if ((MaxCtrlRecvBuffers > hattrp->hca_max_chan_sz) ||
152 	    (MaxCtrlRecvBuffers > hattrp->hca_max_cq_sz)) {
153 		RDS_DPRINTF0("RDSIB", "MaxCtrlRecvBuffers is greater than that "
154 		    "supported by the HCA driver (%d > %d or %d), lowering it "
155 		    "to a supported value.", MaxCtrlRecvBuffers,
156 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
157 
158 		MaxCtrlRecvBuffers = (hattrp->hca_max_chan_sz >
159 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
160 		    hattrp->hca_max_chan_sz;
161 	}
162 
163 	/* The MaxRecvMemory should be less than that supported by the HCA */
164 	if ((NDataRX * RdsPktSize) > hattrp->hca_max_memr_len) {
165 		RDS_DPRINTF0("RDSIB", "MaxRecvMemory is greater than that "
166 		    "supported by the HCA driver (%d > %d), lowering it to %d",
167 		    NDataRX * RdsPktSize, hattrp->hca_max_memr_len,
168 		    hattrp->hca_max_memr_len);
169 
170 		NDataRX = hattrp->hca_max_memr_len/RdsPktSize;
171 	}
172 }
173 
174 /*
175  * Called on open of first RDS socket
176  */
/*
 * Called on open of first RDS socket
 *
 * Registers with IBTF, opens and queries every HCA in the system,
 * allocates one PD per HCA, and finally registers and binds the RDS
 * service.  HCAs that fail any setup step are skipped; success requires
 * at least one fully initialized HCA.
 *
 * Returns 0 on success, -1 on failure.
 */
int
rdsib_open_ib()
{
	ib_guid_t	*guidp;
	rds_hca_t	*hcap, *hcap1;
	uint_t		ix, hcaix, nhcas;
	int		ret;

	RDS_DPRINTF4("rdsib_open_ib", "enter: statep %p", rdsib_statep);

	ASSERT(rdsib_statep != NULL);
	if (rdsib_statep == NULL) {
		RDS_DPRINTF1("rdsib_open_ib", "RDS Statep not initialized");
		return (-1);
	}

	/* How many hcas are there? */
	nhcas = ibt_get_hca_list(&guidp);
	if (nhcas == 0) {
		RDS_DPRINTF2("rdsib_open_ib", "No IB HCAs Available");
		return (-1);
	}

	RDS_DPRINTF3("rdsib_open_ib", "Number of HCAs: %d", nhcas);

	/* Register with IBTF */
	ret = ibt_attach(&rds_ib_modinfo, rdsib_dev_info, rdsib_statep,
	    &rdsib_statep->rds_ibhdl);
	if (ret != IBT_SUCCESS) {
		RDS_DPRINTF2(LABEL, "ibt_attach failed: %d", ret);
		(void) ibt_free_hca_list(guidp, nhcas);
		return (-1);
	}

	/*
	 * Open each HCA and gather its information. Don't care about HCAs
	 * that cannot be opened. It is OK as long as at least one HCA can be
	 * opened.
	 * Initialize a HCA only if all the information is available.
	 */
	hcap1 = NULL;
	for (ix = 0, hcaix = 0; ix < nhcas; ix++) {
		RDS_DPRINTF3(LABEL, "Open HCA: 0x%llx", guidp[ix]);

		hcap = (rds_hca_t *)kmem_zalloc(sizeof (rds_hca_t), KM_SLEEP);

		ret = ibt_open_hca(rdsib_statep->rds_ibhdl, guidp[ix],
		    &hcap->hca_hdl);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF2("rdsib_open_ib",
			    "ibt_open_hca: 0x%llx failed: %d", guidp[ix], ret);
			kmem_free(hcap, sizeof (rds_hca_t));
			continue;
		}

		hcap->hca_guid = guidp[ix];

		/* query HCA attributes (channel/CQ/memory limits etc.) */
		ret = ibt_query_hca(hcap->hca_hdl, &hcap->hca_attr);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF2("rdsib_open_ib",
			    "Query HCA: 0x%llx failed:  %d", guidp[ix], ret);
			ret = ibt_close_hca(hcap->hca_hdl);
			ASSERT(ret == IBT_SUCCESS);
			kmem_free(hcap, sizeof (rds_hca_t));
			continue;
		}

		/* port 0 means: get info on all ports of this HCA */
		ret = ibt_query_hca_ports(hcap->hca_hdl, 0,
		    &hcap->hca_pinfop, &hcap->hca_nports, &hcap->hca_pinfo_sz);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF2("rdsib_open_ib",
			    "Query HCA 0x%llx ports failed: %d", guidp[ix],
			    ret);
			ret = ibt_close_hca(hcap->hca_hdl);
			ASSERT(ret == IBT_SUCCESS);
			kmem_free(hcap, sizeof (rds_hca_t));
			continue;
		}

		/* Only one PD per HCA is allocated, so do it here */
		ret = ibt_alloc_pd(hcap->hca_hdl, IBT_PD_NO_FLAGS,
		    &hcap->hca_pdhdl);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF2(LABEL, "ibt_alloc_pd 0x%llx failed: %d",
			    guidp[ix], ret);
			/* unwind: port info, then the HCA handle itself */
			(void) ibt_free_portinfo(hcap->hca_pinfop,
			    hcap->hca_pinfo_sz);
			ret = ibt_close_hca(hcap->hca_hdl);
			ASSERT(ret == IBT_SUCCESS);
			kmem_free(hcap, sizeof (rds_hca_t));
			continue;
		}

		/* clamp global buffer tunables to this HCA's limits */
		rdsib_validate_chan_sizes(&hcap->hca_attr);

		/* this HCA is fully initialized, go to the next one */
		hcaix++;
		hcap->hca_nextp = hcap1;
		hcap1 = hcap;
	}

	/* free the HCA list, we are done with it */
	(void) ibt_free_hca_list(guidp, nhcas);

	if (hcaix == 0) {
		/* Failed to Initialize even one HCA */
		RDS_DPRINTF2("rdsib_open_ib", "No HCAs are initialized");
		(void) ibt_detach(rdsib_statep->rds_ibhdl);
		rdsib_statep->rds_ibhdl = NULL;
		return (-1);
	}

	if (hcaix < nhcas) {
		RDS_DPRINTF2("rdsib_open_ib", "HCAs %d/%d failed to initialize",
		    (nhcas - hcaix), nhcas);
	}

	rdsib_statep->rds_hcalistp = hcap1;
	rdsib_statep->rds_nhcas = hcaix;

	/* register the RDS service */
	rdsib_statep->rds_srvhdl =
	    rds_register_service(rdsib_statep->rds_ibhdl);
	if (rdsib_statep->rds_srvhdl == NULL) {
		RDS_DPRINTF2("rdsib_open_ib", "Service registration failed");
	} else {
		/* bind the service on all available ports */
		ret = rds_bind_service(rdsib_statep);
		if (ret != 0) {
			RDS_DPRINTF2("rdsib_open_ib", "Bind service failed");
		}
	}

	/*
	 * NOTE(review): service registration/bind failures are logged but
	 * still return 0 — presumably intentional (IB open succeeded);
	 * confirm with callers.
	 */
	RDS_DPRINTF4("rdsib_open_ib", "return: statep %p", rdsib_statep);

	return (0);
}
314 
/*
 * Called when all ports are closed.
 *
 * Tears down everything rdsib_open_ib() set up, in reverse order:
 * unbind/deregister the RDS services, close all sessions, release
 * per-HCA resources (PD, port info, HCA handle), then detach from IBTF.
 */
void
rdsib_close_ib()
{
	rds_hca_t	*hcap, *nextp;
	int		ret;

	RDS_DPRINTF2("rds_close_ib", "enter: statep %p", rdsib_statep);

	if (rdsib_statep->rds_srvhdl != NULL) {
		(void) ibt_unbind_all_services(rdsib_statep->rds_srvhdl);
		(void) ibt_deregister_service(rdsib_statep->rds_ibhdl,
		    rdsib_statep->rds_srvhdl);
		(void) ibt_release_ip_sid(rdsib_statep->rds_service_id);

		/* also retire the old (compatibility) service, if any */
		(void) ibt_unbind_all_services(rdsib_statep->rds_old_srvhdl);
		(void) ibt_deregister_service(rdsib_statep->rds_ibhdl,
		    rdsib_statep->rds_old_srvhdl);
	}

	/* close and destroy all the sessions */
	rds_close_sessions(NULL);

	/*
	 * Detach the HCA list from the state under the writer lock so no
	 * other thread can find these HCAs while we free them below.
	 */
	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
	hcap = rdsib_statep->rds_hcalistp;
	rdsib_statep->rds_hcalistp = NULL;
	rdsib_statep->rds_nhcas = 0;
	rw_exit(&rdsib_statep->rds_hca_lock);

	while (hcap != NULL) {
		nextp = hcap->hca_nextp;

		/* ret is consumed only by ASSERT (DEBUG builds) */
		ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
		ASSERT(ret == IBT_SUCCESS);

		(void) ibt_free_portinfo(hcap->hca_pinfop, hcap->hca_pinfo_sz);

		ret = ibt_close_hca(hcap->hca_hdl);
		ASSERT(ret == IBT_SUCCESS);

		kmem_free(hcap, sizeof (rds_hca_t));
		hcap = nextp;
	}

	/* Deregister with IBTF */
	if (rdsib_statep->rds_ibhdl != NULL) {
		(void) ibt_detach(rdsib_statep->rds_ibhdl);
		rdsib_statep->rds_ibhdl = NULL;
	}

	RDS_DPRINTF2("rds_close_ib", "return: statep %p", rdsib_statep);
}
370 
371 /* Return hcap, given the hca guid */
372 rds_hca_t *
373 rds_get_hcap(rds_state_t *statep, ib_guid_t hca_guid)
374 {
375 	rds_hca_t	*hcap;
376 
377 	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: Enter: statep: 0x%p "
378 	    "guid: %llx", statep, hca_guid);
379 
380 	rw_enter(&statep->rds_hca_lock, RW_READER);
381 
382 	hcap = statep->rds_hcalistp;
383 	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
384 		hcap = hcap->hca_nextp;
385 	}
386 
387 	rw_exit(&statep->rds_hca_lock);
388 
389 	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: return");
390 
391 	return (hcap);
392 }
393 
394 /* Return hcap, given a gid */
395 rds_hca_t *
396 rds_gid_to_hcap(rds_state_t *statep, ib_gid_t gid)
397 {
398 	rds_hca_t	*hcap;
399 	uint_t		ix;
400 
401 	RDS_DPRINTF4("rds_gid_to_hcap", "Enter: statep: 0x%p gid: %llx:%llx",
402 	    statep, gid.gid_prefix, gid.gid_guid);
403 
404 	rw_enter(&statep->rds_hca_lock, RW_READER);
405 
406 	hcap = statep->rds_hcalistp;
407 	while (hcap != NULL) {
408 		for (ix = 0; ix < hcap->hca_nports; ix++) {
409 			if ((hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_prefix ==
410 			    gid.gid_prefix) &&
411 			    (hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_guid ==
412 			    gid.gid_guid)) {
413 				RDS_DPRINTF4("rds_gid_to_hcap",
414 				    "gid found in hcap: 0x%p", hcap);
415 				rw_exit(&statep->rds_hca_lock);
416 				return (hcap);
417 			}
418 		}
419 		hcap = hcap->hca_nextp;
420 	}
421 
422 	rw_exit(&statep->rds_hca_lock);
423 
424 	return (NULL);
425 }
426 
/*
 * This is called from the send CQ handler.
 *
 * Invoked when a previously posted ACK (an RDMA write, see the caller's
 * IBT_WRC_RDMAW case) has completed.  ep_ackds.ds_va holds the buffer id
 * carried by the last ACK that was sent; ep_rbufid is the most recently
 * received buffer id.  If more messages arrived since the last ACK went
 * out, post the next ACK; otherwise drop the outstanding-RDMA count.
 */
void
rds_send_acknowledgement(rds_ep_t *ep)
{
	int	ret;
	uint_t	ix;

	RDS_DPRINTF4("rds_send_acknowledgement", "Enter EP(%p)", ep);

	mutex_enter(&ep->ep_lock);

	/* an ACK just completed, so one RDMA must be outstanding */
	ASSERT(ep->ep_rdmacnt != 0);

	/*
	 * The previous ACK completed successfully, send the next one
	 * if more messages were received after sending the last ACK
	 */
	if (ep->ep_rbufid != *(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va) {
		/* record the latest buffer id in the ACK payload */
		*(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va = ep->ep_rbufid;
		mutex_exit(&ep->ep_lock);

		/* send acknowledgement */
		RDS_INCR_TXACKS();
		ret = ibt_post_send(ep->ep_chanhdl, &ep->ep_ackwr, 1, &ix);
		if (ret != IBT_SUCCESS) {
			RDS_DPRINTF1("rds_send_acknowledgement",
			    "EP(%p): ibt_post_send for acknowledgement "
			    "failed: %d, SQ depth: %d",
			    ep, ret, ep->ep_sndpool.pool_nbusy);
			/* post failed: this ACK is no longer outstanding */
			mutex_enter(&ep->ep_lock);
			ep->ep_rdmacnt--;
			mutex_exit(&ep->ep_lock);
		}
	} else {
		/* ACKed all messages, no more to ACK */
		ep->ep_rdmacnt--;
		mutex_exit(&ep->ep_lock);
		return;
	}

	RDS_DPRINTF4("rds_send_acknowledgement", "Return EP(%p)", ep);
}
469 
/*
 * Poll one work completion off a control channel's recv CQ and process
 * it: account for the RQ slot it frees, refill the RQ via taskq when
 * the level drops to the low watermark, and hand the control packet to
 * rds_handle_control_message().
 *
 * Returns the ibt_poll_cq() status; the caller loops until IBT_CQ_EMPTY.
 */
static int
rds_poll_ctrl_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
{
	ibt_wc_t	wc;
	uint_t		npolled;
	rds_buf_t	*bp;
	rds_ctrl_pkt_t	*cpkt;
	rds_qp_t	*recvqp;
	int		ret = IBT_SUCCESS;

	RDS_DPRINTF4("rds_poll_ctrl_completions", "Enter: EP(%p)", ep);

	bzero(&wc, sizeof (ibt_wc_t));
	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
	if (ret != IBT_SUCCESS) {
		if (ret != IBT_CQ_EMPTY) {
			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: %d", ep, cq, ret);
		} else {
			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: IBT_CQ_EMPTY", ep, cq);
		}
		return (ret);
	}

	/* the WR id is the receive buffer that was posted */
	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;

	if (wc.wc_status != IBT_WC_SUCCESS) {
		mutex_enter(&ep->ep_recvqp.qp_lock);
		ep->ep_recvqp.qp_level--;
		mutex_exit(&ep->ep_recvqp.qp_lock);

		/* Free the buffer */
		bp->buf_state = RDS_RCVBUF_FREE;
		rds_free_recv_buf(bp, 1);

		/* Receive completion failure */
		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
			/* flushes are expected during teardown; log others */
			RDS_DPRINTF2("rds_poll_ctrl_completions",
			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
			    ep, cq, wc.wc_id, wc.wc_status);
		}
		return (ret);
	}

	/* there is one less in the RQ */
	recvqp = &ep->ep_recvqp;
	mutex_enter(&recvqp->qp_lock);
	recvqp->qp_level--;
	if ((recvqp->qp_taskqpending == B_FALSE) &&
	    (recvqp->qp_level <= recvqp->qp_lwm)) {
		/* Time to post more buffers into the RQ */
		recvqp->qp_taskqpending = B_TRUE;
		mutex_exit(&recvqp->qp_lock);

		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF1(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			/* clear the flag so a later completion can retry */
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		mutex_exit(&recvqp->qp_lock);
	}

	/* dispatch the control packet, then recycle its buffer */
	cpkt = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
	rds_handle_control_message(ep->ep_sp, cpkt);

	bp->buf_state = RDS_RCVBUF_FREE;
	rds_free_recv_buf(bp, 1);

	RDS_DPRINTF4("rds_poll_ctrl_completions", "Return: EP(%p)", ep);

	return (ret);
}
548 
549 #define	RDS_POST_FEW_ATATIME	100
550 /* Post recv WRs into the RQ. Assumes the ep->refcnt is already incremented */
551 void
552 rds_post_recv_buf(void *arg)
553 {
554 	ibt_channel_hdl_t	chanhdl;
555 	rds_ep_t		*ep;
556 	rds_session_t		*sp;
557 	rds_qp_t		*recvqp;
558 	rds_bufpool_t		*gp;
559 	rds_buf_t		*bp, *bp1;
560 	ibt_recv_wr_t		*wrp, wr[RDS_POST_FEW_ATATIME];
561 	rds_hca_t		*hcap;
562 	uint_t			npost, nspace, rcv_len;
563 	uint_t			ix, jx, kx;
564 	int			ret;
565 
566 	chanhdl = (ibt_channel_hdl_t)arg;
567 	RDS_DPRINTF4("rds_post_recv_buf", "Enter: CHAN(%p)", chanhdl);
568 	RDS_INCR_POST_RCV_BUF_CALLS();
569 
570 	ep = (rds_ep_t *)ibt_get_chan_private(chanhdl);
571 	ASSERT(ep != NULL);
572 	sp = ep->ep_sp;
573 	recvqp = &ep->ep_recvqp;
574 
575 	RDS_DPRINTF5("rds_post_recv_buf", "EP(%p)", ep);
576 
577 	/* get the hcap for the HCA hosting this channel */
578 	hcap = rds_get_hcap(rdsib_statep, ep->ep_hca_guid);
579 	if (hcap == NULL) {
580 		RDS_DPRINTF2("rds_post_recv_buf", "HCA (0x%llx) not found",
581 		    ep->ep_hca_guid);
582 		return;
583 	}
584 
585 	/* Make sure the session is still connected */
586 	rw_enter(&sp->session_lock, RW_READER);
587 	if ((sp->session_state != RDS_SESSION_STATE_INIT) &&
588 	    (sp->session_state != RDS_SESSION_STATE_CONNECTED)) {
589 		RDS_DPRINTF2("rds_post_recv_buf", "EP(%p): Session is not "
590 		    "in active state (%d)", ep, sp->session_state);
591 		rw_exit(&sp->session_lock);
592 		return;
593 	}
594 	rw_exit(&sp->session_lock);
595 
596 	/* how many can be posted */
597 	mutex_enter(&recvqp->qp_lock);
598 	nspace = recvqp->qp_depth - recvqp->qp_level;
599 	if (nspace == 0) {
600 		RDS_DPRINTF2("rds_post_recv_buf", "RQ is FULL");
601 		recvqp->qp_taskqpending = B_FALSE;
602 		mutex_exit(&recvqp->qp_lock);
603 		return;
604 	}
605 	mutex_exit(&recvqp->qp_lock);
606 
607 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
608 		gp = &rds_dpool;
609 		rcv_len = RdsPktSize;
610 	} else {
611 		gp = &rds_cpool;
612 		rcv_len = RDS_CTRLPKT_SIZE;
613 	}
614 
615 	bp = rds_get_buf(gp, nspace, &jx);
616 	if (bp == NULL) {
617 		RDS_DPRINTF2(LABEL, "EP(%p): No Recv buffers available", ep);
618 		/* try again later */
619 		ret = ddi_taskq_dispatch(rds_taskq, rds_post_recv_buf,
620 		    (void *)ep->ep_chanhdl, DDI_NOSLEEP);
621 		if (ret != DDI_SUCCESS) {
622 			RDS_DPRINTF1(LABEL, "ddi_taskq_dispatch failed: %d",
623 			    ret);
624 			mutex_enter(&recvqp->qp_lock);
625 			recvqp->qp_taskqpending = B_FALSE;
626 			mutex_exit(&recvqp->qp_lock);
627 		}
628 		return;
629 	}
630 
631 	if (jx != nspace) {
632 		RDS_DPRINTF2(LABEL, "EP(%p): Recv buffers "
633 		    "needed: %d available: %d", ep, nspace, jx);
634 		nspace = jx;
635 	}
636 
637 	bp1 = bp;
638 	for (ix = 0; ix < nspace; ix++) {
639 		bp1->buf_ep = ep;
640 		ASSERT(bp1->buf_state == RDS_RCVBUF_FREE);
641 		bp1->buf_state = RDS_RCVBUF_POSTED;
642 		bp1->buf_ds.ds_key = hcap->hca_lkey;
643 		bp1->buf_ds.ds_len = rcv_len;
644 		bp1 = bp1->buf_nextp;
645 	}
646 
647 #if 0
648 	wrp = kmem_zalloc(RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t),
649 	    KM_SLEEP);
650 #else
651 	wrp = &wr[0];
652 #endif
653 
654 	npost = nspace;
655 	while (npost) {
656 		jx = (npost > RDS_POST_FEW_ATATIME) ?
657 		    RDS_POST_FEW_ATATIME : npost;
658 		for (ix = 0; ix < jx; ix++) {
659 			wrp[ix].wr_id = (uintptr_t)bp;
660 			wrp[ix].wr_nds = 1;
661 			wrp[ix].wr_sgl = &bp->buf_ds;
662 			bp = bp->buf_nextp;
663 		}
664 
665 		ret = ibt_post_recv(chanhdl, wrp, jx, &kx);
666 		if ((ret != IBT_SUCCESS) || (kx != jx)) {
667 			RDS_DPRINTF1(LABEL, "ibt_post_recv for %d WRs failed: "
668 			    "%d", npost, ret);
669 			npost -= kx;
670 			break;
671 		}
672 
673 		npost -= jx;
674 	}
675 
676 	mutex_enter(&recvqp->qp_lock);
677 	if (npost != 0) {
678 		RDS_DPRINTF2("rds_post_recv_buf",
679 		    "EP(%p) Failed to post %d WRs", ep, npost);
680 		recvqp->qp_level += (nspace - npost);
681 	} else {
682 		recvqp->qp_level += nspace;
683 	}
684 
685 	/*
686 	 * sometimes, the recv WRs can get consumed as soon as they are
687 	 * posted. In that case, taskq thread to post more WRs to the RQ will
688 	 * not be scheduled as the taskqpending flag is still set.
689 	 */
690 	if (recvqp->qp_level == 0) {
691 		mutex_exit(&recvqp->qp_lock);
692 		ret = ddi_taskq_dispatch(rds_taskq,
693 		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
694 		if (ret != DDI_SUCCESS) {
695 			RDS_DPRINTF1("rds_post_recv_buf",
696 			    "ddi_taskq_dispatch failed: %d", ret);
697 			mutex_enter(&recvqp->qp_lock);
698 			recvqp->qp_taskqpending = B_FALSE;
699 			mutex_exit(&recvqp->qp_lock);
700 		}
701 	} else {
702 		recvqp->qp_taskqpending = B_FALSE;
703 		mutex_exit(&recvqp->qp_lock);
704 	}
705 
706 #if 0
707 	kmem_free(wrp, RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t));
708 #endif
709 
710 	RDS_DPRINTF4("rds_post_recv_buf", "Return: EP(%p)", ep);
711 }
712 
/*
 * Poll one work completion off a data channel's recv CQ and process it:
 * account for the freed RQ slot, refill the RQ via taskq at the low
 * watermark, and either deliver the packet to the socket layer or link
 * it into the endpoint's in-progress segmented message.
 *
 * Segmentation protocol (from dh_npkts/dh_psn handling below):
 * dh_psn == 0 marks the first packet of a message, dh_npkts == 1 marks
 * the last (or only) packet; intermediate packets are chained onto
 * ep_segfbp/ep_seglbp until the last one arrives.
 *
 * Returns the ibt_poll_cq() status; the caller loops until IBT_CQ_EMPTY.
 */
static int
rds_poll_data_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
{
	ibt_wc_t	wc;
	rds_buf_t	*bp;
	rds_data_hdr_t	*pktp;
	rds_qp_t	*recvqp;
	uint_t		npolled;
	int		ret = IBT_SUCCESS;


	RDS_DPRINTF4("rds_poll_data_completions", "Enter: EP(%p)", ep);

	bzero(&wc, sizeof (ibt_wc_t));
	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
	if (ret != IBT_SUCCESS) {
		if (ret != IBT_CQ_EMPTY) {
			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: %d", ep, cq, ret);
		} else {
			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
			    "returned: IBT_CQ_EMPTY", ep, cq);
		}
		return (ret);
	}

	/*
	 * The WR id is the posted receive buffer.  It is optimistically
	 * marked ONSOCKQ here; the error path below re-marks it FREE.
	 */
	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
	ASSERT(bp->buf_state == RDS_RCVBUF_POSTED);
	bp->buf_state = RDS_RCVBUF_ONSOCKQ;
	bp->buf_nextp = NULL;

	if (wc.wc_status != IBT_WC_SUCCESS) {
		mutex_enter(&ep->ep_recvqp.qp_lock);
		ep->ep_recvqp.qp_level--;
		mutex_exit(&ep->ep_recvqp.qp_lock);

		/* free the buffer */
		bp->buf_state = RDS_RCVBUF_FREE;
		rds_free_recv_buf(bp, 1);

		/* Receive completion failure */
		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
			/* flushes are expected during teardown; log others */
			RDS_DPRINTF2("rds_poll_data_completions",
			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
			    ep, cq, wc.wc_id, wc.wc_status);
			RDS_INCR_RXERRS();
		}
		return (ret);
	}

	/* there is one less in the RQ */
	recvqp = &ep->ep_recvqp;
	mutex_enter(&recvqp->qp_lock);
	recvqp->qp_level--;
	if ((recvqp->qp_taskqpending == B_FALSE) &&
	    (recvqp->qp_level <= recvqp->qp_lwm)) {
		/* Time to post more buffers into the RQ */
		recvqp->qp_taskqpending = B_TRUE;
		mutex_exit(&recvqp->qp_lock);

		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF1(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			/* clear the flag so a later completion can retry */
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		mutex_exit(&recvqp->qp_lock);
	}

	pktp = (rds_data_hdr_t *)(uintptr_t)bp->buf_ds.ds_va;
	ASSERT(pktp->dh_datalen != 0);

	RDS_DPRINTF5(LABEL, "Message Received: sendIP: 0x%x recvIP: 0x%x "
	    "sendport: %d recvport: %d npkts: %d pktno: %d", ep->ep_remip,
	    ep->ep_myip, pktp->dh_sendport, pktp->dh_recvport,
	    pktp->dh_npkts, pktp->dh_psn);

	RDS_DPRINTF3(LABEL, "BP(%p): npkts: %d psn: %d", bp,
	    pktp->dh_npkts, pktp->dh_psn);

	if (pktp->dh_npkts == 1) {
		/* single pkt or last packet */
		if (pktp->dh_psn != 0) {
			/* last packet of a segmented message */
			ASSERT(ep->ep_seglbp != NULL);
			ep->ep_seglbp->buf_nextp = bp;
			ep->ep_seglbp = bp;
			/* deliver the whole chain, then reset segment state */
			rds_received_msg(ep, ep->ep_segfbp);
			ep->ep_segfbp = NULL;
			ep->ep_seglbp = NULL;
		} else {
			/* single packet */
			rds_received_msg(ep, bp);
		}
	} else {
		/* multi-pkt msg */
		if (pktp->dh_psn == 0) {
			/* first packet */
			ASSERT(ep->ep_segfbp == NULL);
			ep->ep_segfbp = bp;
			ep->ep_seglbp = bp;
		} else {
			/* intermediate packet */
			ASSERT(ep->ep_segfbp != NULL);
			ep->ep_seglbp->buf_nextp = bp;
			ep->ep_seglbp = bp;
		}
	}

	RDS_DPRINTF4("rds_poll_data_completions", "Return: EP(%p)", ep);

	return (ret);
}
830 
831 void
832 rds_recvcq_handler(ibt_cq_hdl_t cq, void *arg)
833 {
834 	rds_ep_t	*ep;
835 	int		ret = IBT_SUCCESS;
836 	int		(*func)(ibt_cq_hdl_t, rds_ep_t *);
837 
838 	ep = (rds_ep_t *)arg;
839 
840 	RDS_DPRINTF4("rds_recvcq_handler", "enter: EP(%p)", ep);
841 
842 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
843 		func = rds_poll_data_completions;
844 	} else {
845 		func = rds_poll_ctrl_completions;
846 	}
847 
848 	do {
849 		ret = func(cq, ep);
850 	} while (ret != IBT_CQ_EMPTY);
851 
852 	/* enable the CQ */
853 	ret = ibt_enable_cq_notify(cq, rds_wc_signal);
854 	if (ret != IBT_SUCCESS) {
855 		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
856 		    "failed: %d", ep, cq, ret);
857 		return;
858 	}
859 
860 	do {
861 		ret = func(cq, ep);
862 	} while (ret != IBT_CQ_EMPTY);
863 
864 	RDS_DPRINTF4("rds_recvcq_handler", "Return: EP(%p)", ep);
865 }
866 
/*
 * Drain a send CQ, polling up to RDS_NUM_DATA_SEND_WCS completions per
 * call to ibt_poll_cq().  Each completion is classified as:
 *   - success: RDMA-write completions trigger the next ACK; ordinary
 *     sends mark their buffer FREE;
 *   - flush error: counted, buffer marked ERROR (expected at teardown);
 *   - any other error: counted, and on the first one the session is
 *     moved to ERROR state with this side made the active end.
 * Reclaimed buffers are chained and returned to the pool in one call;
 * if any hard error was seen, rds_handle_send_error() is invoked.
 *
 * 'lock' is passed through to rds_free_send_buf().
 */
void
rds_poll_send_completions(ibt_cq_hdl_t cq, rds_ep_t *ep, boolean_t lock)
{
	ibt_wc_t	wc[RDS_NUM_DATA_SEND_WCS];
	uint_t		npolled, nret, send_error = 0;
	rds_buf_t	*headp, *tailp, *bp;
	int		ret, ix;

	RDS_DPRINTF4("rds_poll_send_completions", "Enter EP(%p)", ep);

	headp = NULL;
	tailp = NULL;
	npolled = 0;
	do {
		ret = ibt_poll_cq(cq, wc, RDS_NUM_DATA_SEND_WCS, &nret);
		if (ret != IBT_SUCCESS) {
			if (ret != IBT_CQ_EMPTY) {
				RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): "
				    "ibt_poll_cq returned: %d", ep, cq, ret);
			} else {
				RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): "
				    "ibt_poll_cq returned: IBT_CQ_EMPTY",
				    ep, cq);
			}

			break;
		}

		for (ix = 0; ix < nret; ix++) {
			if (wc[ix].wc_status == IBT_WC_SUCCESS) {
				if (wc[ix].wc_type == IBT_WRC_RDMAW) {
					/* an ACK RDMA write completed */
					rds_send_acknowledgement(ep);
					continue;
				}

				/* ordinary send: reclaim the buffer */
				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
				bp->buf_state = RDS_SNDBUF_FREE;
			} else if (wc[ix].wc_status == IBT_WC_WR_FLUSHED_ERR) {
				RDS_INCR_TXERRS();
				RDS_DPRINTF5("rds_poll_send_completions",
				    "EP(%p): WC ID: %p ERROR: %d", ep,
				    wc[ix].wc_id, wc[ix].wc_status);

				/* flushed ACK writes carry a sentinel id */
				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
					mutex_enter(&ep->ep_lock);
					ep->ep_rdmacnt--;
					mutex_exit(&ep->ep_lock);
					continue;
				}

				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				bp->buf_state = RDS_SNDBUF_ERROR;
			} else {
				RDS_INCR_TXERRS();
				RDS_DPRINTF2("rds_poll_send_completions",
				    "EP(%p): WC ID: %p ERROR: %d", ep,
				    wc[ix].wc_id, wc[ix].wc_status);
				if (send_error == 0) {
					rds_session_t	*sp = ep->ep_sp;

					/* don't let anyone send anymore */
					rw_enter(&sp->session_lock, RW_WRITER);
					if (sp->session_state !=
					    RDS_SESSION_STATE_ERROR) {
						sp->session_state =
						    RDS_SESSION_STATE_ERROR;
						/* Make this the active end */
						sp->session_type =
						    RDS_SESSION_ACTIVE;
					}
					rw_exit(&sp->session_lock);
				}

				send_error++;

				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
					mutex_enter(&ep->ep_lock);
					ep->ep_rdmacnt--;
					mutex_exit(&ep->ep_lock);
					continue;
				}

				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
				bp->buf_state = RDS_SNDBUF_ERROR;
			}

			/* chain the reclaimed buffer for a bulk free below */
			bp->buf_nextp = NULL;
			if (headp) {
				tailp->buf_nextp = bp;
				tailp = bp;
			} else {
				headp = bp;
				tailp = bp;
			}

			npolled++;
		}

		/* in polling mode, bound the work done in one call */
		if (rds_no_interrupts && (npolled > 100)) {
			break;
		}

		if (rds_no_interrupts == 1) {
			break;
		}
	} while (ret != IBT_CQ_EMPTY);

	RDS_DPRINTF5("rds_poll_send_completions", "Npolled: %d send_error: %d",
	    npolled, send_error);

	/* put the buffers to the pool */
	if (npolled != 0) {
		rds_free_send_buf(ep, headp, tailp, npolled, lock);
	}

	if (send_error != 0) {
		rds_handle_send_error(ep);
	}

	RDS_DPRINTF4("rds_poll_send_completions", "Return EP(%p)", ep);
}
989 
990 void
991 rds_sendcq_handler(ibt_cq_hdl_t cq, void *arg)
992 {
993 	rds_ep_t	*ep;
994 	int		ret;
995 
996 	ep = (rds_ep_t *)arg;
997 
998 	RDS_DPRINTF4("rds_sendcq_handler", "Enter: EP(%p)", ep);
999 
1000 	/* enable the CQ */
1001 	ret = ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION);
1002 	if (ret != IBT_SUCCESS) {
1003 		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1004 		    "failed: %d", ep, cq, ret);
1005 		return;
1006 	}
1007 
1008 	rds_poll_send_completions(cq, ep, B_FALSE);
1009 
1010 	RDS_DPRINTF4("rds_sendcq_handler", "Return: EP(%p)", ep);
1011 }
1012 
1013 void
1014 rds_ep_free_rc_channel(rds_ep_t *ep)
1015 {
1016 	int ret;
1017 
1018 	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Enter", ep);
1019 
1020 	ASSERT(mutex_owned(&ep->ep_lock));
1021 
1022 	/* free the QP */
1023 	if (ep->ep_chanhdl != NULL) {
1024 		/* wait until the RQ is empty */
1025 		(void) ibt_flush_channel(ep->ep_chanhdl);
1026 		(void) rds_is_recvq_empty(ep, B_TRUE);
1027 		ret = ibt_free_channel(ep->ep_chanhdl);
1028 		if (ret != IBT_SUCCESS) {
1029 			RDS_DPRINTF1("rds_ep_free_rc_channel", "EP(%p) "
1030 			    "ibt_free_channel returned: %d", ep, ret);
1031 		}
1032 		ep->ep_chanhdl = NULL;
1033 	} else {
1034 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1035 		    "EP(%p) Channel is ALREADY FREE", ep);
1036 	}
1037 
1038 	/* free the Send CQ */
1039 	if (ep->ep_sendcq != NULL) {
1040 		ret = ibt_free_cq(ep->ep_sendcq);
1041 		if (ret != IBT_SUCCESS) {
1042 			RDS_DPRINTF1("rds_ep_free_rc_channel",
1043 			    "EP(%p) - for sendcq, ibt_free_cq returned %d",
1044 			    ep, ret);
1045 		}
1046 		ep->ep_sendcq = NULL;
1047 	} else {
1048 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1049 		    "EP(%p) SendCQ is ALREADY FREE", ep);
1050 	}
1051 
1052 	/* free the Recv CQ */
1053 	if (ep->ep_recvcq != NULL) {
1054 		ret = ibt_free_cq(ep->ep_recvcq);
1055 		if (ret != IBT_SUCCESS) {
1056 			RDS_DPRINTF1("rds_ep_free_rc_channel",
1057 			    "EP(%p) - for recvcq, ibt_free_cq returned %d",
1058 			    ep, ret);
1059 		}
1060 		ep->ep_recvcq = NULL;
1061 	} else {
1062 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1063 		    "EP(%p) RecvCQ is ALREADY FREE", ep);
1064 	}
1065 
1066 	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Return", ep);
1067 }
1068 
1069 /* Allocate resources for RC channel */
1070 ibt_channel_hdl_t
1071 rds_ep_alloc_rc_channel(rds_ep_t *ep, uint8_t hca_port)
1072 {
1073 	int				ret = IBT_SUCCESS;
1074 	ibt_cq_attr_t			scqattr, rcqattr;
1075 	ibt_rc_chan_alloc_args_t	chanargs;
1076 	ibt_channel_hdl_t		chanhdl;
1077 	rds_session_t			*sp;
1078 	rds_hca_t			*hcap;
1079 
1080 	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Enter: 0x%p port: %d",
1081 	    ep, hca_port);
1082 
1083 	/* Update the EP with the right IP address and HCA guid */
1084 	sp = ep->ep_sp;
1085 	ASSERT(sp != NULL);
1086 	rw_enter(&sp->session_lock, RW_READER);
1087 	mutex_enter(&ep->ep_lock);
1088 	ep->ep_myip = sp->session_myip;
1089 	ep->ep_remip = sp->session_remip;
1090 	hcap = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
1091 	ep->ep_hca_guid = hcap->hca_guid;
1092 	mutex_exit(&ep->ep_lock);
1093 	rw_exit(&sp->session_lock);
1094 
1095 	/* reset taskqpending flag here */
1096 	ep->ep_recvqp.qp_taskqpending = B_FALSE;
1097 
1098 	if (ep->ep_type == RDS_EP_TYPE_CTRL) {
1099 		scqattr.cq_size = MaxCtrlSendBuffers;
1100 		scqattr.cq_sched = NULL;
1101 		scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1102 
1103 		rcqattr.cq_size = MaxCtrlRecvBuffers;
1104 		rcqattr.cq_sched = NULL;
1105 		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1106 
1107 		chanargs.rc_sizes.cs_sq = MaxCtrlSendBuffers;
1108 		chanargs.rc_sizes.cs_rq = MaxCtrlRecvBuffers;
1109 		chanargs.rc_sizes.cs_sq_sgl = 1;
1110 		chanargs.rc_sizes.cs_rq_sgl = 1;
1111 	} else {
1112 		scqattr.cq_size = MaxDataSendBuffers + RDS_NUM_ACKS;
1113 		scqattr.cq_sched = NULL;
1114 		scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1115 
1116 		rcqattr.cq_size = MaxDataRecvBuffers;
1117 		rcqattr.cq_sched = NULL;
1118 		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1119 
1120 		chanargs.rc_sizes.cs_sq = MaxDataSendBuffers + RDS_NUM_ACKS;
1121 		chanargs.rc_sizes.cs_rq = MaxDataRecvBuffers;
1122 		chanargs.rc_sizes.cs_sq_sgl = 1;
1123 		chanargs.rc_sizes.cs_rq_sgl = 1;
1124 	}
1125 
1126 	mutex_enter(&ep->ep_lock);
1127 	if (ep->ep_sendcq == NULL) {
1128 		/* returned size is always greater than the requested size */
1129 		ret = ibt_alloc_cq(hcap->hca_hdl, &scqattr,
1130 		    &ep->ep_sendcq, NULL);
1131 		if (ret != IBT_SUCCESS) {
1132 			RDS_DPRINTF2(LABEL, "ibt_alloc_cq for sendCQ "
1133 			    "failed, size = %d: %d", scqattr.cq_size, ret);
1134 			mutex_exit(&ep->ep_lock);
1135 			return (NULL);
1136 		}
1137 
1138 		(void) ibt_set_cq_handler(ep->ep_sendcq, rds_sendcq_handler,
1139 		    ep);
1140 
1141 		if (rds_no_interrupts == 0) {
1142 			ret = ibt_enable_cq_notify(ep->ep_sendcq,
1143 			    IBT_NEXT_COMPLETION);
1144 			if (ret != IBT_SUCCESS) {
1145 				RDS_DPRINTF2(LABEL,
1146 				    "ibt_enable_cq_notify failed: %d", ret);
1147 				(void) ibt_free_cq(ep->ep_sendcq);
1148 				ep->ep_sendcq = NULL;
1149 				mutex_exit(&ep->ep_lock);
1150 				return (NULL);
1151 			}
1152 		}
1153 	}
1154 
1155 	if (ep->ep_recvcq == NULL) {
1156 		/* returned size is always greater than the requested size */
1157 		ret = ibt_alloc_cq(hcap->hca_hdl, &rcqattr,
1158 		    &ep->ep_recvcq, NULL);
1159 		if (ret != IBT_SUCCESS) {
1160 			RDS_DPRINTF2(LABEL, "ibt_alloc_cq for recvCQ "
1161 			    "failed, size = %d: %d", rcqattr.cq_size, ret);
1162 			(void) ibt_free_cq(ep->ep_sendcq);
1163 			ep->ep_sendcq = NULL;
1164 			mutex_exit(&ep->ep_lock);
1165 			return (NULL);
1166 		}
1167 
1168 		(void) ibt_set_cq_handler(ep->ep_recvcq, rds_recvcq_handler,
1169 		    ep);
1170 
1171 		ret = ibt_enable_cq_notify(ep->ep_recvcq, rds_wc_signal);
1172 		if (ret != IBT_SUCCESS) {
1173 			RDS_DPRINTF2(LABEL,
1174 			    "ibt_enable_cq_notify failed: %d", ret);
1175 			(void) ibt_free_cq(ep->ep_recvcq);
1176 			ep->ep_recvcq = NULL;
1177 			(void) ibt_free_cq(ep->ep_sendcq);
1178 			ep->ep_sendcq = NULL;
1179 			mutex_exit(&ep->ep_lock);
1180 			return (NULL);
1181 		}
1182 	}
1183 
1184 	chanargs.rc_flags = IBT_ALL_SIGNALED;
1185 	chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
1186 	    IBT_CEP_ATOMIC;
1187 	chanargs.rc_hca_port_num = hca_port;
1188 	chanargs.rc_scq = ep->ep_sendcq;
1189 	chanargs.rc_rcq = ep->ep_recvcq;
1190 	chanargs.rc_pd = hcap->hca_pdhdl;
1191 	chanargs.rc_srq = NULL;
1192 
1193 	ret = ibt_alloc_rc_channel(hcap->hca_hdl,
1194 	    IBT_ACHAN_NO_FLAGS, &chanargs, &chanhdl, NULL);
1195 	if (ret != IBT_SUCCESS) {
1196 		RDS_DPRINTF2(LABEL, "ibt_alloc_rc_channel fail: %d",
1197 		    ret);
1198 		(void) ibt_free_cq(ep->ep_recvcq);
1199 		ep->ep_recvcq = NULL;
1200 		(void) ibt_free_cq(ep->ep_sendcq);
1201 		ep->ep_sendcq = NULL;
1202 		mutex_exit(&ep->ep_lock);
1203 		return (NULL);
1204 	}
1205 	mutex_exit(&ep->ep_lock);
1206 
1207 	/* Chan private should contain the ep */
1208 	(void) ibt_set_chan_private(chanhdl, ep);
1209 
1210 	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Return: 0x%p", chanhdl);
1211 
1212 	return (chanhdl);
1213 }
1214 
1215 
#if 0	/* compiled out: unused helper retained for reference only */

/*
 * Return node guid given a port gid.
 *
 * NOTE(review): this function is disabled via #if 0 and is never
 * built; it is kept only as a reference for mapping a port gid to
 * its node GUID through ibt_gid_to_node_info().  Returns 0LL when
 * the lookup fails.
 */
ib_guid_t
rds_gid_to_node_guid(ib_gid_t gid)
{
	ibt_node_info_t	nodeinfo;
	int		ret;

	RDS_DPRINTF4("rds_gid_to_node_guid", "Enter: gid: %llx:%llx",
	    gid.gid_prefix, gid.gid_guid);

	ret = ibt_gid_to_node_info(gid, &nodeinfo);
	if (ret != IBT_SUCCESS) {
		RDS_DPRINTF2(LABEL, "ibt_gid_node_info for gid: %llx:%llx "
		    "failed", gid.gid_prefix, gid.gid_guid);
		return (0LL);
	}

	RDS_DPRINTF4("rds_gid_to_node_guid", "Return: Node guid: %llx",
	    nodeinfo.n_node_guid);

	return (nodeinfo.n_node_guid);
}

#endif
1242 
1243 static void
1244 rds_handle_portup_event(rds_state_t *statep, ibt_hca_hdl_t hdl,
1245     ibt_async_event_t *event)
1246 {
1247 	rds_hca_t		*hcap;
1248 	ibt_hca_portinfo_t	*newpinfop, *oldpinfop;
1249 	uint_t			newsize, oldsize, nport;
1250 	ib_gid_t		gid;
1251 	int			ret;
1252 
1253 	RDS_DPRINTF2("rds_handle_portup_event",
1254 	    "Enter: GUID: 0x%llx Statep: %p", event->ev_hca_guid, statep);
1255 
1256 	hcap = rds_get_hcap(statep, event->ev_hca_guid);
1257 	if (hcap == NULL) {
1258 		RDS_DPRINTF2("rds_handle_portup_event", "HCA: 0x%llx is "
1259 		    "not in our list", event->ev_hca_guid);
1260 		return;
1261 	}
1262 
1263 	ret = ibt_query_hca_ports(hdl, 0, &newpinfop, &nport, &newsize);
1264 	if (ret != IBT_SUCCESS) {
1265 		RDS_DPRINTF2(LABEL, "ibt_query_hca_ports failed: %d", ret);
1266 		return;
1267 	}
1268 
1269 	oldpinfop = hcap->hca_pinfop;
1270 	oldsize = hcap->hca_pinfo_sz;
1271 	hcap->hca_pinfop = newpinfop;
1272 	hcap->hca_pinfo_sz = newsize;
1273 
1274 	/* structure copy */
1275 	gid = newpinfop[event->ev_port - 1].p_sgid_tbl[0];
1276 
1277 	/* bind RDS service on the port, pass statep as cm_private */
1278 	ret = ibt_bind_service(statep->rds_srvhdl, gid, NULL, statep, NULL);
1279 	if (ret != IBT_SUCCESS) {
1280 		RDS_DPRINTF2(LABEL, "Bind service for HCA: 0x%llx Port: %d "
1281 		    "gid %llx:%llx returned: %d", event->ev_hca_guid,
1282 		    event->ev_port, gid.gid_prefix, gid.gid_guid, ret);
1283 	}
1284 
1285 	(void) ibt_free_portinfo(oldpinfop, oldsize);
1286 
1287 	RDS_DPRINTF2("rds_handle_portup_event", "Return: GUID: 0x%llx",
1288 	    event->ev_hca_guid);
1289 }
1290 
1291 static void
1292 rds_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
1293     ibt_async_event_t *event)
1294 {
1295 	rds_state_t		*statep;
1296 
1297 	RDS_DPRINTF2("rds_async_handler", "Async code: %d", code);
1298 
1299 	switch (code) {
1300 	case IBT_EVENT_PORT_UP:
1301 		statep = (rds_state_t *)clntp;
1302 		rds_handle_portup_event(statep, hdl, event);
1303 		break;
1304 
1305 	default:
1306 		RDS_DPRINTF2(LABEL, "Async event: %d not handled", code);
1307 	}
1308 
1309 	RDS_DPRINTF2("rds_async_handler", "Return: code: %d", code);
1310 }
1311