xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_ib.c (revision 9525b14bcdeb5b5f6f95ab27c2f48f18bd2ec829)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
27  *
28  * This software is available to you under a choice of one of two
29  * licenses.  You may choose to be licensed under the terms of the GNU
30  * General Public License (GPL) Version 2, available from the file
31  * COPYING in the main directory of this source tree, or the
32  * OpenIB.org BSD license below:
33  *
34  *     Redistribution and use in source and binary forms, with or
35  *     without modification, are permitted provided that the following
36  *     conditions are met:
37  *
38  *	- Redistributions of source code must retain the above
39  *	  copyright notice, this list of conditions and the following
40  *	  disclaimer.
41  *
42  *	- Redistributions in binary form must reproduce the above
43  *	  copyright notice, this list of conditions and the following
44  *	  disclaimer in the documentation and/or other materials
45  *	  provided with the distribution.
46  *
47  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
48  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
49  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
50  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
51  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
52  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
53  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
54  * SOFTWARE.
55  *
56  */
57 /*
58  * Sun elects to include this software in Sun product
59  * under the OpenIB BSD license.
60  *
61  *
62  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
63  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
64  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
65  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
66  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
67  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
68  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
69  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
70  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
71  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
72  * POSSIBILITY OF SUCH DAMAGE.
73  */
74 
75 #include <sys/types.h>
76 #include <sys/ddi.h>
77 #include <sys/sunddi.h>
78 #include <sys/ib/clients/rds/rdsib_cm.h>
79 #include <sys/ib/clients/rds/rdsib_ib.h>
80 #include <sys/ib/clients/rds/rdsib_buf.h>
81 #include <sys/ib/clients/rds/rdsib_ep.h>
82 #include <sys/ib/clients/rds/rds_kstat.h>
83 
84 static void rds_async_handler(void *clntp, ibt_hca_hdl_t hdl,
85     ibt_async_code_t code, ibt_async_event_t *event);
86 
87 static struct ibt_clnt_modinfo_s rds_ib_modinfo = {
88 	IBTI_V_CURR,
89 	IBT_NETWORK,
90 	rds_async_handler,
91 	NULL,
92 	"RDS"
93 };
94 
95 /* performance tunables */
96 uint_t		rds_no_interrupts = 0;
97 uint_t		rds_poll_percent_full = 25;
98 uint_t		rds_wc_signal = IBT_NEXT_SOLICITED;
99 uint_t		rds_waittime_ms = 100; /* ms */
100 
101 extern dev_info_t *rdsib_dev_info;
102 extern void rds_close_sessions();
103 
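/*
 * Clamp the configured send/receive queue depths and the receive memory
 * size to the limits reported in the HCA attributes (hca_max_chan_sz,
 * hca_max_cq_sz and hca_max_memr_len).
 */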
104 static void
105 rdsib_validate_chan_sizes(ibt_hca_attr_t *hattrp)
106 {
107 	/* The SQ size should not be more than that supported by the HCA */
108 	if (((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_chan_sz) ||
109 	    ((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_cq_sz)) {
110 		RDS_DPRINTF2("RDSIB", "MaxDataSendBuffers + %d is greater "
111 		    "than that supported by the HCA driver "
112 		    "(%d + %d > %d or %d), lowering it to a supported value.",
113 		    RDS_NUM_ACKS, MaxDataSendBuffers, RDS_NUM_ACKS,
114 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
115 
116 		MaxDataSendBuffers = (hattrp->hca_max_chan_sz >
117 		    hattrp->hca_max_cq_sz) ?
118 		    hattrp->hca_max_cq_sz - RDS_NUM_ACKS :
119 		    hattrp->hca_max_chan_sz - RDS_NUM_ACKS;
120 	}
121 
122 	/* The RQ size should not be more than that supported by the HCA */
123 	if ((MaxDataRecvBuffers > hattrp->hca_max_chan_sz) ||
124 	    (MaxDataRecvBuffers > hattrp->hca_max_cq_sz)) {
125 		RDS_DPRINTF2("RDSIB", "MaxDataRecvBuffers is greater than that "
126 		    "supported by the HCA driver (%d > %d or %d), lowering it "
127 		    "to a supported value.", MaxDataRecvBuffers,
128 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
129 
130 		MaxDataRecvBuffers = (hattrp->hca_max_chan_sz >
131 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
132 		    hattrp->hca_max_chan_sz;
133 	}
134 
135 	/* The SQ size should not be more than that supported by the HCA */
136 	if ((MaxCtrlSendBuffers > hattrp->hca_max_chan_sz) ||
137 	    (MaxCtrlSendBuffers > hattrp->hca_max_cq_sz)) {
138 		RDS_DPRINTF2("RDSIB", "MaxCtrlSendBuffers is greater than that "
139 		    "supported by the HCA driver (%d > %d or %d), lowering it "
140 		    "to a supported value.", MaxCtrlSendBuffers,
141 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
142 
143 		MaxCtrlSendBuffers = (hattrp->hca_max_chan_sz >
144 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
145 		    hattrp->hca_max_chan_sz;
146 	}
147 
148 	/* The RQ size should not be more than that supported by the HCA */
149 	if ((MaxCtrlRecvBuffers > hattrp->hca_max_chan_sz) ||
150 	    (MaxCtrlRecvBuffers > hattrp->hca_max_cq_sz)) {
151 		RDS_DPRINTF2("RDSIB", "MaxCtrlRecvBuffers is greater than that "
152 		    "supported by the HCA driver (%d > %d or %d), lowering it "
153 		    "to a supported value.", MaxCtrlRecvBuffers,
154 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
155 
156 		MaxCtrlRecvBuffers = (hattrp->hca_max_chan_sz >
157 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
158 		    hattrp->hca_max_chan_sz;
159 	}
160 
161 	/* MaxRecvMemory should not be more than that supported by the HCA */
162 	if ((NDataRX * RdsPktSize) > hattrp->hca_max_memr_len) {
163 		RDS_DPRINTF2("RDSIB", "MaxRecvMemory is greater than that "
164 		    "supported by the HCA driver (%d > %d), lowering it to %d",
165 		    NDataRX * RdsPktSize, hattrp->hca_max_memr_len,
166 		    hattrp->hca_max_memr_len);
167 
168 		NDataRX = hattrp->hca_max_memr_len/RdsPktSize;
169 	}
170 }
171 
172 /* Return hcap, given the hca guid */
173 rds_hca_t *
174 rds_lkup_hca(ib_guid_t hca_guid)
175 {
176 	rds_hca_t	*hcap;
177 
178 	RDS_DPRINTF4("rds_lkup_hca", "Enter: statep: 0x%p "
179 	    "guid: %llx", rdsib_statep, hca_guid);
180 
181 	rw_enter(&rdsib_statep->rds_hca_lock, RW_READER);
182 
183 	hcap = rdsib_statep->rds_hcalistp;
184 	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
185 		hcap = hcap->hca_nextp;
186 	}
187 
188 	rw_exit(&rdsib_statep->rds_hca_lock);
189 
190 	RDS_DPRINTF4("rds_lkup_hca", "return");
191 
192 	return (hcap);
193 }
194 
195 void rds_randomize_qps(rds_hca_t *hcap);
196 
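/*
 * Open the HCA identified by hca_guid, query its attributes and ports,
 * allocate its protection domain and add it to the global HCA list.
 * Returns NULL if the HCA is already open or if any step fails.
 */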
197 static rds_hca_t *
198 rdsib_init_hca(ib_guid_t hca_guid)
199 {
200 	rds_hca_t	*hcap;
201 	boolean_t	alloc = B_FALSE;
202 	int		ret;
203 
204 	RDS_DPRINTF2("rdsib_init_hca", "enter: HCA 0x%llx", hca_guid);
205 
206 	/* Do a HCA lookup */
207 	hcap = rds_lkup_hca(hca_guid);
208 
209 	if (hcap != NULL && hcap->hca_hdl != NULL) {
210 		/*
211 		 * This can happen if we get IBT_HCA_ATTACH_EVENT on an HCA
212 		 * that we have already opened. Just return NULL so that
213 		 * we'll not end up reinitializing the HCA again.
214 		 */
215 		RDS_DPRINTF2("rdsib_init_hca", "HCA already initialized");
216 		return (NULL);
217 	}
218 
219 	if (hcap == NULL) {
220 		RDS_DPRINTF2("rdsib_init_hca", "New HCA is added");
221 		hcap = (rds_hca_t *)kmem_zalloc(sizeof (rds_hca_t), KM_SLEEP);
222 		alloc = B_TRUE;
223 	}
224 
225 	hcap->hca_guid = hca_guid;
226 	ret = ibt_open_hca(rdsib_statep->rds_ibhdl, hca_guid,
227 	    &hcap->hca_hdl);
228 	if (ret != IBT_SUCCESS) {
229 		if (ret == IBT_HCA_IN_USE) {
230 			RDS_DPRINTF2("rdsib_init_hca",
231 			    "ibt_open_hca: 0x%llx returned IBT_HCA_IN_USE",
232 			    hca_guid);
233 		} else {
234 			RDS_DPRINTF2("rdsib_init_hca",
235 			    "ibt_open_hca: 0x%llx failed: %d", hca_guid, ret);
236 		}
237 		if (alloc == B_TRUE) {
238 			kmem_free(hcap, sizeof (rds_hca_t));
239 		}
240 		return (NULL);
241 	}
242 
243 	ret = ibt_query_hca(hcap->hca_hdl, &hcap->hca_attr);
244 	if (ret != IBT_SUCCESS) {
245 		RDS_DPRINTF2("rdsib_init_hca",
246 		    "Query HCA: 0x%llx failed:  %d", hca_guid, ret);
247 		ret = ibt_close_hca(hcap->hca_hdl);
248 		ASSERT(ret == IBT_SUCCESS);
249 		if (alloc == B_TRUE) {
250 			kmem_free(hcap, sizeof (rds_hca_t));
251 		} else {
252 			hcap->hca_hdl = NULL;
253 		}
254 		return (NULL);
255 	}
256 
257 	ret = ibt_query_hca_ports(hcap->hca_hdl, 0,
258 	    &hcap->hca_pinfop, &hcap->hca_nports, &hcap->hca_pinfo_sz);
259 	if (ret != IBT_SUCCESS) {
260 		RDS_DPRINTF2("rdsib_init_hca",
261 		    "Query HCA 0x%llx ports failed: %d", hca_guid,
262 		    ret);
263 		ret = ibt_close_hca(hcap->hca_hdl);
264 		hcap->hca_hdl = NULL;
265 		ASSERT(ret == IBT_SUCCESS);
266 		if (alloc == B_TRUE) {
267 			kmem_free(hcap, sizeof (rds_hca_t));
268 		} else {
269 			hcap->hca_hdl = NULL;
270 		}
271 		return (NULL);
272 	}
273 
274 	/* Only one PD per HCA is allocated, so do it here */
275 	ret = ibt_alloc_pd(hcap->hca_hdl, IBT_PD_NO_FLAGS,
276 	    &hcap->hca_pdhdl);
277 	if (ret != IBT_SUCCESS) {
278 		RDS_DPRINTF2("rdsib_init_hca",
279 		    "ibt_alloc_pd 0x%llx failed: %d", hca_guid, ret);
280 		(void) ibt_free_portinfo(hcap->hca_pinfop,
281 		    hcap->hca_pinfo_sz);
282 		ret = ibt_close_hca(hcap->hca_hdl);
283 		ASSERT(ret == IBT_SUCCESS);
284 		hcap->hca_hdl = NULL;
285 		if (alloc == B_TRUE) {
286 			kmem_free(hcap, sizeof (rds_hca_t));
287 		} else {
288 			hcap->hca_hdl = NULL;
289 		}
290 		return (NULL);
291 	}
292 
293 	rdsib_validate_chan_sizes(&hcap->hca_attr);
294 
295 	/* To minimize stale connections after ungraceful reboots */
296 	rds_randomize_qps(hcap);
297 
298 	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
299 	hcap->hca_state = RDS_HCA_STATE_OPEN;
300 	if (alloc == B_TRUE) {
301 		/* this is a new HCA, add it to the list */
302 		rdsib_statep->rds_nhcas++;
303 		hcap->hca_nextp = rdsib_statep->rds_hcalistp;
304 		rdsib_statep->rds_hcalistp = hcap;
305 	}
306 	rw_exit(&rdsib_statep->rds_hca_lock);
307 
308 	RDS_DPRINTF2("rdsib_init_hca", "return: HCA 0x%llx", hca_guid);
309 
310 	return (hcap);
311 }
312 
313 /*
314  * Called from attach
315  */
316 int
317 rdsib_initialize_ib()
318 {
319 	ib_guid_t	*guidp;
320 	rds_hca_t	*hcap;
321 	uint_t		ix, hcaix, nhcas;
322 	int		ret;
323 
324 	RDS_DPRINTF2("rdsib_initialize_ib", "enter: statep %p", rdsib_statep);
325 
326 	ASSERT(rdsib_statep != NULL);
327 	if (rdsib_statep == NULL) {
328 		RDS_DPRINTF1("rdsib_initialize_ib",
329 		    "RDS Statep not initialized");
330 		return (-1);
331 	}
332 
333 	/* How many hcas are there? */
334 	nhcas = ibt_get_hca_list(&guidp);
335 	if (nhcas == 0) {
336 		RDS_DPRINTF2("rdsib_initialize_ib", "No IB HCAs Available");
337 		return (-1);
338 	}
339 
340 	RDS_DPRINTF3("rdsib_initialize_ib", "Number of HCAs: %d", nhcas);
341 
342 	/* Register with IBTF */
343 	ret = ibt_attach(&rds_ib_modinfo, rdsib_dev_info, rdsib_statep,
344 	    &rdsib_statep->rds_ibhdl);
345 	if (ret != IBT_SUCCESS) {
346 		RDS_DPRINTF2("rdsib_initialize_ib", "ibt_attach failed: %d",
347 		    ret);
348 		(void) ibt_free_hca_list(guidp, nhcas);
349 		return (-1);
350 	}
351 
352 	/*
353 	 * Open each HCA and gather its information. Don't care about HCAs
354 	 * that cannot be opened. It is OK as long as at least one HCA can be
355 	 * opened.
356 	 * Initialize an HCA only if all the information is available.
357 	 */
358 	for (ix = 0, hcaix = 0; ix < nhcas; ix++) {
359 		RDS_DPRINTF3(LABEL, "Open HCA: 0x%llx", guidp[ix]);
360 
361 		hcap = rdsib_init_hca(guidp[ix]);
362 		if (hcap != NULL) hcaix++;
363 	}
364 
365 	/* free the HCA list, we are done with it */
366 	(void) ibt_free_hca_list(guidp, nhcas);
367 
368 	if (hcaix == 0) {
369 		/* Failed to Initialize even one HCA */
370 		RDS_DPRINTF2("rdsib_initialize_ib", "No HCAs are initialized");
371 		(void) ibt_detach(rdsib_statep->rds_ibhdl);
372 		rdsib_statep->rds_ibhdl = NULL;
373 		return (-1);
374 	}
375 
376 	if (hcaix < nhcas) {
377 		RDS_DPRINTF2("rdsib_initialize_ib", "HCAs %d/%d failed to "
378 		    "initialize", (nhcas - hcaix), nhcas);
379 	}
380 
381 	RDS_DPRINTF2("rdsib_initialize_ib", "return: statep %p", rdsib_statep);
382 
383 	return (0);
384 }
385 
386 /*
387  * Called from detach
388  */
389 void
390 rdsib_deinitialize_ib()
391 {
392 	rds_hca_t	*hcap, *nextp;
393 	int		ret;
394 
395 	RDS_DPRINTF2("rdsib_deinitialize_ib", "enter: statep %p", rdsib_statep);
396 
397 	/* close and destroy all the sessions */
398 	rds_close_sessions(NULL);
399 
400 	/* Release all HCA resources */
401 	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
402 	RDS_DPRINTF2("rdsib_deinitialize_ib", "HCA List: %p, NHCA: %d",
403 	    rdsib_statep->rds_hcalistp, rdsib_statep->rds_nhcas);
404 	hcap = rdsib_statep->rds_hcalistp;
405 	rdsib_statep->rds_hcalistp = NULL;
406 	rdsib_statep->rds_nhcas = 0;
407 	rw_exit(&rdsib_statep->rds_hca_lock);
408 
409 	while (hcap != NULL) {
410 		nextp = hcap->hca_nextp;
411 
412 		if (hcap->hca_hdl != NULL) {
413 			ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
414 			ASSERT(ret == IBT_SUCCESS);
415 
416 			(void) ibt_free_portinfo(hcap->hca_pinfop,
417 			    hcap->hca_pinfo_sz);
418 
419 			ret = ibt_close_hca(hcap->hca_hdl);
420 			ASSERT(ret == IBT_SUCCESS);
421 		}
422 
423 		kmem_free(hcap, sizeof (rds_hca_t));
424 		hcap = nextp;
425 	}
426 
427 	/* Deregister with IBTF */
428 	if (rdsib_statep->rds_ibhdl != NULL) {
429 		(void) ibt_detach(rdsib_statep->rds_ibhdl);
430 		rdsib_statep->rds_ibhdl = NULL;
431 	}
432 
433 	RDS_DPRINTF2("rdsib_deinitialize_ib", "return: statep %p",
434 	    rdsib_statep);
435 }
436 
437 /*
438  * Called on open of first RDS socket
439  */
440 int
441 rdsib_open_ib()
442 {
443 	int	ret;
444 
445 	RDS_DPRINTF2("rdsib_open_ib", "enter: statep %p", rdsib_statep);
446 
447 	/* Enable incoming connection requests */
448 	if (rdsib_statep->rds_srvhdl == NULL) {
449 		rdsib_statep->rds_srvhdl =
450 		    rds_register_service(rdsib_statep->rds_ibhdl);
451 		if (rdsib_statep->rds_srvhdl == NULL) {
452 			RDS_DPRINTF2("rdsib_open_ib",
453 			    "Service registration failed");
454 			return (-1);
455 		} else {
456 			/* bind the service on all available ports */
457 			ret = rds_bind_service(rdsib_statep);
458 			if (ret != 0) {
459 				RDS_DPRINTF2("rdsib_open_ib",
460 				    "Bind service failed: %d", ret);
461 			}
462 		}
463 	}
464 
465 	RDS_DPRINTF2("rdsib_open_ib", "return: statep %p", rdsib_statep);
466 
467 	return (0);
468 }
469 
470 /*
471  * Called when all ports are closed.
472  */
473 void
474 rdsib_close_ib()
475 {
476 	int	ret;
477 
478 	RDS_DPRINTF2("rdsib_close_ib", "enter: statep %p", rdsib_statep);
479 
480 	/* Disable incoming connection requests */
481 	if (rdsib_statep->rds_srvhdl != NULL) {
482 		ret = ibt_unbind_all_services(rdsib_statep->rds_srvhdl);
483 		if (ret != 0) {
484 			RDS_DPRINTF2("rdsib_close_ib",
485 			    "ibt_unbind_all_services failed: %d\n", ret);
486 		}
487 		ret = ibt_deregister_service(rdsib_statep->rds_ibhdl,
488 		    rdsib_statep->rds_srvhdl);
489 		if (ret != 0) {
490 			RDS_DPRINTF2("rdsib_close_ib",
491 			    "ibt_deregister_service failed: %d\n", ret);
492 		} else {
493 			rdsib_statep->rds_srvhdl = NULL;
494 		}
495 	}
496 
497 	RDS_DPRINTF2("rdsib_close_ib", "return: statep %p", rdsib_statep);
498 }
499 
500 /* Return hcap, given the hca guid */
501 rds_hca_t *
502 rds_get_hcap(rds_state_t *statep, ib_guid_t hca_guid)
503 {
504 	rds_hca_t	*hcap;
505 
506 	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: Enter: statep: 0x%p "
507 	    "guid: %llx", statep, hca_guid);
508 
509 	rw_enter(&statep->rds_hca_lock, RW_READER);
510 
511 	hcap = statep->rds_hcalistp;
512 	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
513 		hcap = hcap->hca_nextp;
514 	}
515 
516 	/*
517 	 * don't let anyone use this HCA until the RECV memory
518 	 * is registered with this HCA
519 	 */
520 	if ((hcap != NULL) &&
521 	    (hcap->hca_state == RDS_HCA_STATE_MEM_REGISTERED)) {
522 		ASSERT(hcap->hca_mrhdl != NULL);
523 		rw_exit(&statep->rds_hca_lock);
524 		return (hcap);
525 	}
526 
527 	RDS_DPRINTF2("rds_get_hcap",
528 	    "HCA (0x%p, 0x%llx) is not initialized", hcap, hca_guid);
529 	rw_exit(&statep->rds_hca_lock);
530 
531 	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: return");
532 
533 	return (NULL);
534 }
535 
536 /* Return hcap, given a gid */
537 rds_hca_t *
538 rds_gid_to_hcap(rds_state_t *statep, ib_gid_t gid)
539 {
540 	rds_hca_t	*hcap;
541 	uint_t		ix;
542 
543 	RDS_DPRINTF4("rds_gid_to_hcap", "Enter: statep: 0x%p gid: %llx:%llx",
544 	    statep, gid.gid_prefix, gid.gid_guid);
545 
546 	rw_enter(&statep->rds_hca_lock, RW_READER);
547 
548 	hcap = statep->rds_hcalistp;
549 	while (hcap != NULL) {
550 
551 		/*
552 		 * don't let anyone use this HCA until the RECV memory
553 		 * is registered with this HCA
554 		 */
555 		if (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED) {
556 			RDS_DPRINTF3("rds_gid_to_hcap",
557 			    "HCA (0x%p, 0x%llx) is not initialized",
558 			    hcap, gid.gid_guid);
559 			hcap = hcap->hca_nextp;
560 			continue;
561 		}
562 
563 		for (ix = 0; ix < hcap->hca_nports; ix++) {
564 			if ((hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_prefix ==
565 			    gid.gid_prefix) &&
566 			    (hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_guid ==
567 			    gid.gid_guid)) {
568 				RDS_DPRINTF4("rds_gid_to_hcap",
569 				    "gid found in hcap: 0x%p", hcap);
570 				rw_exit(&statep->rds_hca_lock);
571 				return (hcap);
572 			}
573 		}
574 		hcap = hcap->hca_nextp;
575 	}
576 
577 	rw_exit(&statep->rds_hca_lock);
578 
579 	return (NULL);
580 }
581 
582 /* This is called from the send CQ handler */
583 void
584 rds_send_acknowledgement(rds_ep_t *ep)
585 {
586 	int	ret;
587 	uint_t	ix;
588 
589 	RDS_DPRINTF4("rds_send_acknowledgement", "Enter EP(%p)", ep);
590 
591 	mutex_enter(&ep->ep_lock);
592 
593 	ASSERT(ep->ep_rdmacnt != 0);
594 
595 	/*
596 	 * The previous ACK completed successfully, send the next one
597 	 * if more messages were received after sending the last ACK
598 	 */
599 	if (ep->ep_rbufid != *(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va) {
600 		*(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va = ep->ep_rbufid;
601 		mutex_exit(&ep->ep_lock);
602 
603 		/* send acknowledgement */
604 		RDS_INCR_TXACKS();
605 		ret = ibt_post_send(ep->ep_chanhdl, &ep->ep_ackwr, 1, &ix);
606 		if (ret != IBT_SUCCESS) {
607 			RDS_DPRINTF2("rds_send_acknowledgement",
608 			    "EP(%p): ibt_post_send for acknowledgement "
609 			    "failed: %d, SQ depth: %d",
610 			    ep, ret, ep->ep_sndpool.pool_nbusy);
611 			mutex_enter(&ep->ep_lock);
612 			ep->ep_rdmacnt--;
613 			mutex_exit(&ep->ep_lock);
614 		}
615 	} else {
616 		/* ACKed all messages, no more to ACK */
617 		ep->ep_rdmacnt--;
618 		mutex_exit(&ep->ep_lock);
619 		return;
620 	}
621 
622 	RDS_DPRINTF4("rds_send_acknowledgement", "Return EP(%p)", ep);
623 }
624 
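/*
 * Poll one completion from the control channel's receive CQ, replenish
 * the RQ if it has drained below the low water mark and hand the control
 * packet to rds_handle_control_message().
 */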
625 static int
626 rds_poll_ctrl_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
627 {
628 	ibt_wc_t	wc;
629 	uint_t		npolled;
630 	rds_buf_t	*bp;
631 	rds_ctrl_pkt_t	*cpkt;
632 	rds_qp_t	*recvqp;
633 	int		ret = IBT_SUCCESS;
634 
635 	RDS_DPRINTF4("rds_poll_ctrl_completions", "Enter: EP(%p)", ep);
636 
637 	bzero(&wc, sizeof (ibt_wc_t));
638 	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
639 	if (ret != IBT_SUCCESS) {
640 		if (ret != IBT_CQ_EMPTY) {
641 			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
642 			    "returned: %d", ep, cq, ret);
643 		} else {
644 			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
645 			    "returned: IBT_CQ_EMPTY", ep, cq);
646 		}
647 		return (ret);
648 	}
649 
650 	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
651 
652 	if (wc.wc_status != IBT_WC_SUCCESS) {
653 		mutex_enter(&ep->ep_recvqp.qp_lock);
654 		ep->ep_recvqp.qp_level--;
655 		mutex_exit(&ep->ep_recvqp.qp_lock);
656 
657 		/* Free the buffer */
658 		bp->buf_state = RDS_RCVBUF_FREE;
659 		rds_free_recv_buf(bp, 1);
660 
661 		/* Receive completion failure */
662 		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
663 			RDS_DPRINTF2("rds_poll_ctrl_completions",
664 			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
665 			    ep, cq, wc.wc_id, wc.wc_status);
666 		}
667 		return (ret);
668 	}
669 
670 	/* there is one less in the RQ */
671 	recvqp = &ep->ep_recvqp;
672 	mutex_enter(&recvqp->qp_lock);
673 	recvqp->qp_level--;
674 	if ((recvqp->qp_taskqpending == B_FALSE) &&
675 	    (recvqp->qp_level <= recvqp->qp_lwm)) {
676 		/* Time to post more buffers into the RQ */
677 		recvqp->qp_taskqpending = B_TRUE;
678 		mutex_exit(&recvqp->qp_lock);
679 
680 		ret = ddi_taskq_dispatch(rds_taskq,
681 		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
682 		if (ret != DDI_SUCCESS) {
683 			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
684 			    ret);
685 			mutex_enter(&recvqp->qp_lock);
686 			recvqp->qp_taskqpending = B_FALSE;
687 			mutex_exit(&recvqp->qp_lock);
688 		}
689 	} else {
690 		mutex_exit(&recvqp->qp_lock);
691 	}
692 
693 	cpkt = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
694 	rds_handle_control_message(ep->ep_sp, cpkt);
695 
696 	bp->buf_state = RDS_RCVBUF_FREE;
697 	rds_free_recv_buf(bp, 1);
698 
699 	RDS_DPRINTF4("rds_poll_ctrl_completions", "Return: EP(%p)", ep);
700 
701 	return (ret);
702 }
703 
704 #define	RDS_POST_FEW_ATATIME	100
705 /* Post recv WRs into the RQ. Assumes the ep->refcnt is already incremented */
706 void
707 rds_post_recv_buf(void *arg)
708 {
709 	ibt_channel_hdl_t	chanhdl;
710 	rds_ep_t		*ep;
711 	rds_session_t		*sp;
712 	rds_qp_t		*recvqp;
713 	rds_bufpool_t		*gp;
714 	rds_buf_t		*bp, *bp1;
715 	ibt_recv_wr_t		*wrp, wr[RDS_POST_FEW_ATATIME];
716 	rds_hca_t		*hcap;
717 	uint_t			npost, nspace, rcv_len;
718 	uint_t			ix, jx, kx;
719 	int			ret;
720 
721 	chanhdl = (ibt_channel_hdl_t)arg;
722 	RDS_DPRINTF4("rds_post_recv_buf", "Enter: CHAN(%p)", chanhdl);
723 	RDS_INCR_POST_RCV_BUF_CALLS();
724 
725 	ep = (rds_ep_t *)ibt_get_chan_private(chanhdl);
726 	ASSERT(ep != NULL);
727 	sp = ep->ep_sp;
728 	recvqp = &ep->ep_recvqp;
729 
730 	RDS_DPRINTF5("rds_post_recv_buf", "EP(%p)", ep);
731 
732 	/* get the hcap for the HCA hosting this channel */
733 	hcap = rds_lkup_hca(ep->ep_hca_guid);
734 	if (hcap == NULL) {
735 		RDS_DPRINTF2("rds_post_recv_buf", "HCA (0x%llx) not found",
736 		    ep->ep_hca_guid);
737 		return;
738 	}
739 
740 	/* Make sure the session is still connected */
741 	rw_enter(&sp->session_lock, RW_READER);
742 	if ((sp->session_state != RDS_SESSION_STATE_INIT) &&
743 	    (sp->session_state != RDS_SESSION_STATE_CONNECTED) &&
744 	    (sp->session_state != RDS_SESSION_STATE_HCA_CLOSING)) {
745 		RDS_DPRINTF2("rds_post_recv_buf", "EP(%p): Session is not "
746 		    "in active state (%d)", ep, sp->session_state);
747 		rw_exit(&sp->session_lock);
748 		return;
749 	}
750 	rw_exit(&sp->session_lock);
751 
752 	/* how many can be posted */
753 	mutex_enter(&recvqp->qp_lock);
754 	nspace = recvqp->qp_depth - recvqp->qp_level;
755 	if (nspace == 0) {
756 		RDS_DPRINTF2("rds_post_recv_buf", "RQ is FULL");
757 		recvqp->qp_taskqpending = B_FALSE;
758 		mutex_exit(&recvqp->qp_lock);
759 		return;
760 	}
761 	mutex_exit(&recvqp->qp_lock);
762 
763 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
764 		gp = &rds_dpool;
765 		rcv_len = RdsPktSize;
766 	} else {
767 		gp = &rds_cpool;
768 		rcv_len = RDS_CTRLPKT_SIZE;
769 	}
770 
771 	bp = rds_get_buf(gp, nspace, &jx);
772 	if (bp == NULL) {
773 		RDS_DPRINTF2(LABEL, "EP(%p): No Recv buffers available", ep);
774 		/* try again later */
775 		ret = ddi_taskq_dispatch(rds_taskq, rds_post_recv_buf,
776 		    (void *)chanhdl, DDI_NOSLEEP);
777 		if (ret != DDI_SUCCESS) {
778 			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
779 			    ret);
780 			mutex_enter(&recvqp->qp_lock);
781 			recvqp->qp_taskqpending = B_FALSE;
782 			mutex_exit(&recvqp->qp_lock);
783 		}
784 		return;
785 	}
786 
787 	if (jx != nspace) {
788 		RDS_DPRINTF2(LABEL, "EP(%p): Recv buffers "
789 		    "needed: %d available: %d", ep, nspace, jx);
790 		nspace = jx;
791 	}
792 
793 	bp1 = bp;
794 	for (ix = 0; ix < nspace; ix++) {
795 		bp1->buf_ep = ep;
796 		ASSERT(bp1->buf_state == RDS_RCVBUF_FREE);
797 		bp1->buf_state = RDS_RCVBUF_POSTED;
798 		bp1->buf_ds.ds_key = hcap->hca_lkey;
799 		bp1->buf_ds.ds_len = rcv_len;
800 		bp1 = bp1->buf_nextp;
801 	}
802 
803 #if 0
804 	wrp = kmem_zalloc(RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t),
805 	    KM_SLEEP);
806 #else
807 	wrp = &wr[0];
808 #endif
809 
810 	npost = nspace;
811 	while (npost) {
812 		jx = (npost > RDS_POST_FEW_ATATIME) ?
813 		    RDS_POST_FEW_ATATIME : npost;
814 		for (ix = 0; ix < jx; ix++) {
815 			wrp[ix].wr_id = (uintptr_t)bp;
816 			wrp[ix].wr_nds = 1;
817 			wrp[ix].wr_sgl = &bp->buf_ds;
818 			bp = bp->buf_nextp;
819 		}
820 
821 		ret = ibt_post_recv(chanhdl, wrp, jx, &kx);
822 		if ((ret != IBT_SUCCESS) || (kx != jx)) {
823 			RDS_DPRINTF2(LABEL, "ibt_post_recv for %d WRs failed: "
824 			    "%d", npost, ret);
825 			npost -= kx;
826 			break;
827 		}
828 
829 		npost -= jx;
830 	}
831 
832 	mutex_enter(&recvqp->qp_lock);
833 	if (npost != 0) {
834 		RDS_DPRINTF2("rds_post_recv_buf",
835 		    "EP(%p) Failed to post %d WRs", ep, npost);
836 		recvqp->qp_level += (nspace - npost);
837 	} else {
838 		recvqp->qp_level += nspace;
839 	}
840 
841 	/*
842 	 * Sometimes the recv WRs can get consumed as soon as they are
843 	 * posted. In that case, the taskq thread that posts more WRs to the
844 	 * RQ will not be scheduled because the taskqpending flag is still set.
845 	 */
846 	if (recvqp->qp_level == 0) {
847 		mutex_exit(&recvqp->qp_lock);
848 		ret = ddi_taskq_dispatch(rds_taskq,
849 		    rds_post_recv_buf, (void *)chanhdl, DDI_NOSLEEP);
850 		if (ret != DDI_SUCCESS) {
851 			RDS_DPRINTF2("rds_post_recv_buf",
852 			    "ddi_taskq_dispatch failed: %d", ret);
853 			mutex_enter(&recvqp->qp_lock);
854 			recvqp->qp_taskqpending = B_FALSE;
855 			mutex_exit(&recvqp->qp_lock);
856 		}
857 	} else {
858 		recvqp->qp_taskqpending = B_FALSE;
859 		mutex_exit(&recvqp->qp_lock);
860 	}
861 
862 #if 0
863 	kmem_free(wrp, RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t));
864 #endif
865 
866 	RDS_DPRINTF4("rds_post_recv_buf", "Return: EP(%p)", ep);
867 }
868 
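/*
 * Poll one completion from the data channel's receive CQ, replenish the
 * RQ when it drops below the low water mark and deliver the packet. A
 * multi-packet message is chained until its last packet arrives and is
 * then passed up via rds_received_msg().
 */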
869 static int
870 rds_poll_data_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
871 {
872 	ibt_wc_t	wc;
873 	rds_buf_t	*bp;
874 	rds_data_hdr_t	*pktp;
875 	rds_qp_t	*recvqp;
876 	uint_t		npolled;
877 	int		ret = IBT_SUCCESS;
878 
879 
880 	RDS_DPRINTF4("rds_poll_data_completions", "Enter: EP(%p)", ep);
881 
882 	bzero(&wc, sizeof (ibt_wc_t));
883 	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
884 	if (ret != IBT_SUCCESS) {
885 		if (ret != IBT_CQ_EMPTY) {
886 			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
887 			    "returned: %d", ep, cq, ret);
888 		} else {
889 			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
890 			    "returned: IBT_CQ_EMPTY", ep, cq);
891 		}
892 		return (ret);
893 	}
894 
895 	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
896 	ASSERT(bp->buf_state == RDS_RCVBUF_POSTED);
897 	bp->buf_state = RDS_RCVBUF_ONSOCKQ;
898 	bp->buf_nextp = NULL;
899 
900 	if (wc.wc_status != IBT_WC_SUCCESS) {
901 		mutex_enter(&ep->ep_recvqp.qp_lock);
902 		ep->ep_recvqp.qp_level--;
903 		mutex_exit(&ep->ep_recvqp.qp_lock);
904 
905 		/* free the buffer */
906 		bp->buf_state = RDS_RCVBUF_FREE;
907 		rds_free_recv_buf(bp, 1);
908 
909 		/* Receive completion failure */
910 		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
911 			RDS_DPRINTF2("rds_poll_data_completions",
912 			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
913 			    ep, cq, wc.wc_id, wc.wc_status);
914 			RDS_INCR_RXERRS();
915 		}
916 		return (ret);
917 	}
918 
919 	/* there is one less in the RQ */
920 	recvqp = &ep->ep_recvqp;
921 	mutex_enter(&recvqp->qp_lock);
922 	recvqp->qp_level--;
923 	if ((recvqp->qp_taskqpending == B_FALSE) &&
924 	    (recvqp->qp_level <= recvqp->qp_lwm)) {
925 		/* Time to post more buffers into the RQ */
926 		recvqp->qp_taskqpending = B_TRUE;
927 		mutex_exit(&recvqp->qp_lock);
928 
929 		ret = ddi_taskq_dispatch(rds_taskq,
930 		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
931 		if (ret != DDI_SUCCESS) {
932 			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
933 			    ret);
934 			mutex_enter(&recvqp->qp_lock);
935 			recvqp->qp_taskqpending = B_FALSE;
936 			mutex_exit(&recvqp->qp_lock);
937 		}
938 	} else {
939 		mutex_exit(&recvqp->qp_lock);
940 	}
941 
942 	pktp = (rds_data_hdr_t *)(uintptr_t)bp->buf_ds.ds_va;
943 	ASSERT(pktp->dh_datalen != 0);
944 
945 	RDS_DPRINTF5(LABEL, "Message Received: sendIP: 0x%x recvIP: 0x%x "
946 	    "sendport: %d recvport: %d npkts: %d pktno: %d", ep->ep_remip,
947 	    ep->ep_myip, pktp->dh_sendport, pktp->dh_recvport,
948 	    pktp->dh_npkts, pktp->dh_psn);
949 
950 	RDS_DPRINTF3(LABEL, "BP(%p): npkts: %d psn: %d", bp,
951 	    pktp->dh_npkts, pktp->dh_psn);
952 
953 	if (pktp->dh_npkts == 1) {
954 		/* single pkt or last packet */
955 		if (pktp->dh_psn != 0) {
956 			/* last packet of a segmented message */
957 			ASSERT(ep->ep_seglbp != NULL);
958 			ep->ep_seglbp->buf_nextp = bp;
959 			ep->ep_seglbp = bp;
960 			rds_received_msg(ep, ep->ep_segfbp);
961 			ep->ep_segfbp = NULL;
962 			ep->ep_seglbp = NULL;
963 		} else {
964 			/* single packet */
965 			rds_received_msg(ep, bp);
966 		}
967 	} else {
968 		/* multi-pkt msg */
969 		if (pktp->dh_psn == 0) {
970 			/* first packet */
971 			ASSERT(ep->ep_segfbp == NULL);
972 			ep->ep_segfbp = bp;
973 			ep->ep_seglbp = bp;
974 		} else {
975 			/* intermediate packet */
976 			ASSERT(ep->ep_segfbp != NULL);
977 			ep->ep_seglbp->buf_nextp = bp;
978 			ep->ep_seglbp = bp;
979 		}
980 	}
981 
982 	RDS_DPRINTF4("rds_poll_data_completions", "Return: EP(%p)", ep);
983 
984 	return (ret);
985 }
986 
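/*
 * Receive CQ handler (shared by data and control endpoints). Drains the
 * CQ, re-arms it with ibt_enable_cq_notify() and drains it once more to
 * close the race with completions that arrived while re-arming.
 */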
987 void
988 rds_recvcq_handler(ibt_cq_hdl_t cq, void *arg)
989 {
990 	rds_ep_t	*ep;
991 	int		ret = IBT_SUCCESS;
992 	int		(*func)(ibt_cq_hdl_t, rds_ep_t *);
993 
994 	ep = (rds_ep_t *)arg;
995 
996 	RDS_DPRINTF4("rds_recvcq_handler", "enter: EP(%p)", ep);
997 
998 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
999 		func = rds_poll_data_completions;
1000 	} else {
1001 		func = rds_poll_ctrl_completions;
1002 	}
1003 
1004 	do {
1005 		ret = func(cq, ep);
1006 	} while (ret != IBT_CQ_EMPTY);
1007 
1008 	/* enable the CQ */
1009 	ret = ibt_enable_cq_notify(cq, rds_wc_signal);
1010 	if (ret != IBT_SUCCESS) {
1011 		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1012 		    "failed: %d", ep, cq, ret);
1013 		return;
1014 	}
1015 
1016 	do {
1017 		ret = func(cq, ep);
1018 	} while (ret != IBT_CQ_EMPTY);
1019 
1020 	RDS_DPRINTF4("rds_recvcq_handler", "Return: EP(%p)", ep);
1021 }
1022 
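/*
 * Drain the send CQ and return completed buffers to the send pool.
 * A successful RDMA-write completion triggers the next acknowledgement;
 * completion errors other than flushes put the session into the error
 * state and are handled by rds_handle_send_error().
 */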
1023 void
1024 rds_poll_send_completions(ibt_cq_hdl_t cq, rds_ep_t *ep, boolean_t lock)
1025 {
1026 	ibt_wc_t	wc[RDS_NUM_DATA_SEND_WCS];
1027 	uint_t		npolled, nret, send_error = 0;
1028 	rds_buf_t	*headp, *tailp, *bp;
1029 	int		ret, ix;
1030 
1031 	RDS_DPRINTF4("rds_poll_send_completions", "Enter EP(%p)", ep);
1032 
1033 	headp = NULL;
1034 	tailp = NULL;
1035 	npolled = 0;
1036 	do {
1037 		ret = ibt_poll_cq(cq, wc, RDS_NUM_DATA_SEND_WCS, &nret);
1038 		if (ret != IBT_SUCCESS) {
1039 			if (ret != IBT_CQ_EMPTY) {
1040 				RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): "
1041 				    "ibt_poll_cq returned: %d", ep, cq, ret);
1042 			} else {
1043 				RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): "
1044 				    "ibt_poll_cq returned: IBT_CQ_EMPTY",
1045 				    ep, cq);
1046 			}
1047 
1048 			break;
1049 		}
1050 
1051 		for (ix = 0; ix < nret; ix++) {
1052 			if (wc[ix].wc_status == IBT_WC_SUCCESS) {
1053 				if (wc[ix].wc_type == IBT_WRC_RDMAW) {
1054 					rds_send_acknowledgement(ep);
1055 					continue;
1056 				}
1057 
1058 				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
1059 				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
1060 				bp->buf_state = RDS_SNDBUF_FREE;
1061 			} else if (wc[ix].wc_status == IBT_WC_WR_FLUSHED_ERR) {
1062 				RDS_INCR_TXERRS();
1063 				RDS_DPRINTF5("rds_poll_send_completions",
1064 				    "EP(%p): WC ID: %p ERROR: %d", ep,
1065 				    wc[ix].wc_id, wc[ix].wc_status);
1066 
1067 				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
1068 					mutex_enter(&ep->ep_lock);
1069 					ep->ep_rdmacnt--;
1070 					mutex_exit(&ep->ep_lock);
1071 					continue;
1072 				}
1073 
1074 				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
1075 				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
1076 				bp->buf_state = RDS_SNDBUF_FREE;
1077 			} else {
1078 				RDS_INCR_TXERRS();
1079 				RDS_DPRINTF2("rds_poll_send_completions",
1080 				    "EP(%p): WC ID: %p ERROR: %d", ep,
1081 				    wc[ix].wc_id, wc[ix].wc_status);
1082 				if (send_error == 0) {
1083 					rds_session_t	*sp = ep->ep_sp;
1084 
1085 					/* don't let anyone send anymore */
1086 					rw_enter(&sp->session_lock, RW_WRITER);
1087 					if (sp->session_state !=
1088 					    RDS_SESSION_STATE_ERROR) {
1089 						sp->session_state =
1090 						    RDS_SESSION_STATE_ERROR;
1091 						/* Make this the active end */
1092 						sp->session_type =
1093 						    RDS_SESSION_ACTIVE;
1094 					}
1095 					rw_exit(&sp->session_lock);
1096 				}
1097 
1098 				send_error++;
1099 
1100 				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
1101 					mutex_enter(&ep->ep_lock);
1102 					ep->ep_rdmacnt--;
1103 					mutex_exit(&ep->ep_lock);
1104 					continue;
1105 				}
1106 
1107 				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
1108 				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
1109 				bp->buf_state = RDS_SNDBUF_FREE;
1110 			}
1111 
1112 			bp->buf_nextp = NULL;
1113 			if (headp) {
1114 				tailp->buf_nextp = bp;
1115 				tailp = bp;
1116 			} else {
1117 				headp = bp;
1118 				tailp = bp;
1119 			}
1120 
1121 			npolled++;
1122 		}
1123 
1124 		if (rds_no_interrupts && (npolled > 100)) {
1125 			break;
1126 		}
1127 
1128 		if (rds_no_interrupts == 1) {
1129 			break;
1130 		}
1131 	} while (ret != IBT_CQ_EMPTY);
1132 
1133 	RDS_DPRINTF5("rds_poll_send_completions", "Npolled: %d send_error: %d",
1134 	    npolled, send_error);
1135 
1136 	/* put the buffers to the pool */
1137 	if (npolled != 0) {
1138 		rds_free_send_buf(ep, headp, tailp, npolled, lock);
1139 	}
1140 
1141 	if (send_error != 0) {
1142 		rds_handle_send_error(ep);
1143 	}
1144 
1145 	RDS_DPRINTF4("rds_poll_send_completions", "Return EP(%p)", ep);
1146 }
1147 
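/*
 * Send CQ handler: re-arm the CQ and reap completed send WRs.
 */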
1148 void
1149 rds_sendcq_handler(ibt_cq_hdl_t cq, void *arg)
1150 {
1151 	rds_ep_t	*ep;
1152 	int		ret;
1153 
1154 	ep = (rds_ep_t *)arg;
1155 
1156 	RDS_DPRINTF4("rds_sendcq_handler", "Enter: EP(%p)", ep);
1157 
1158 	/* enable the CQ */
1159 	ret = ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION);
1160 	if (ret != IBT_SUCCESS) {
1161 		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1162 		    "failed: %d", ep, cq, ret);
1163 		return;
1164 	}
1165 
1166 	rds_poll_send_completions(cq, ep, B_FALSE);
1167 
1168 	RDS_DPRINTF4("rds_sendcq_handler", "Return: EP(%p)", ep);
1169 }
1170 
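/*
 * Free the RC channel and its send/recv CQs for this endpoint. The
 * channel is flushed and the RQ drained before it is freed. Caller must
 * hold ep_lock.
 */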
1171 void
1172 rds_ep_free_rc_channel(rds_ep_t *ep)
1173 {
1174 	int ret;
1175 
1176 	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Enter", ep);
1177 
1178 	ASSERT(mutex_owned(&ep->ep_lock));
1179 
1180 	/* free the QP */
1181 	if (ep->ep_chanhdl != NULL) {
1182 		/* wait until the RQ is empty */
1183 		(void) ibt_flush_channel(ep->ep_chanhdl);
1184 		(void) rds_is_recvq_empty(ep, B_TRUE);
1185 		ret = ibt_free_channel(ep->ep_chanhdl);
1186 		if (ret != IBT_SUCCESS) {
1187 			RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) "
1188 			    "ibt_free_channel returned: %d", ep, ret);
1189 		}
1190 		ep->ep_chanhdl = NULL;
1191 	} else {
1192 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1193 		    "EP(%p) Channel is ALREADY FREE", ep);
1194 	}
1195 
1196 	/* free the Send CQ */
1197 	if (ep->ep_sendcq != NULL) {
1198 		ret = ibt_free_cq(ep->ep_sendcq);
1199 		if (ret != IBT_SUCCESS) {
1200 			RDS_DPRINTF2("rds_ep_free_rc_channel",
1201 			    "EP(%p) - for sendcq, ibt_free_cq returned %d",
1202 			    ep, ret);
1203 		}
1204 		ep->ep_sendcq = NULL;
1205 	} else {
1206 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1207 		    "EP(%p) SendCQ is ALREADY FREE", ep);
1208 	}
1209 
1210 	/* free the Recv CQ */
1211 	if (ep->ep_recvcq != NULL) {
1212 		ret = ibt_free_cq(ep->ep_recvcq);
1213 		if (ret != IBT_SUCCESS) {
1214 			RDS_DPRINTF2("rds_ep_free_rc_channel",
1215 			    "EP(%p) - for recvcq, ibt_free_cq returned %d",
1216 			    ep, ret);
1217 		}
1218 		ep->ep_recvcq = NULL;
1219 	} else {
1220 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1221 		    "EP(%p) RecvCQ is ALREADY FREE", ep);
1222 	}
1223 
1224 	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Return", ep);
1225 }
1226 
1227 /* Allocate resources for RC channel */
1228 ibt_channel_hdl_t
1229 rds_ep_alloc_rc_channel(rds_ep_t *ep, uint8_t hca_port)
1230 {
1231 	int				ret = IBT_SUCCESS;
1232 	ibt_cq_attr_t			scqattr, rcqattr;
1233 	ibt_rc_chan_alloc_args_t	chanargs;
1234 	ibt_channel_hdl_t		chanhdl;
1235 	rds_session_t			*sp;
1236 	rds_hca_t			*hcap;
1237 
1238 	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Enter: 0x%p port: %d",
1239 	    ep, hca_port);
1240 
1241 	/* Update the EP with the right IP address and HCA guid */
1242 	sp = ep->ep_sp;
1243 	ASSERT(sp != NULL);
1244 	rw_enter(&sp->session_lock, RW_READER);
1245 	mutex_enter(&ep->ep_lock);
1246 	ep->ep_myip = sp->session_myip;
1247 	ep->ep_remip = sp->session_remip;
1248 	hcap = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
1249 	ep->ep_hca_guid = hcap->hca_guid;
1250 	mutex_exit(&ep->ep_lock);
1251 	rw_exit(&sp->session_lock);
1252 
1253 	/* reset taskqpending flag here */
1254 	ep->ep_recvqp.qp_taskqpending = B_FALSE;
1255 
1256 	if (ep->ep_type == RDS_EP_TYPE_CTRL) {
1257 		scqattr.cq_size = MaxCtrlSendBuffers;
1258 		scqattr.cq_sched = NULL;
1259 		scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1260 
1261 		rcqattr.cq_size = MaxCtrlRecvBuffers;
1262 		rcqattr.cq_sched = NULL;
1263 		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1264 
1265 		chanargs.rc_sizes.cs_sq = MaxCtrlSendBuffers;
1266 		chanargs.rc_sizes.cs_rq = MaxCtrlRecvBuffers;
1267 		chanargs.rc_sizes.cs_sq_sgl = 1;
1268 		chanargs.rc_sizes.cs_rq_sgl = 1;
1269 	} else {
1270 		scqattr.cq_size = MaxDataSendBuffers + RDS_NUM_ACKS;
1271 		scqattr.cq_sched = NULL;
1272 		scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1273 
1274 		rcqattr.cq_size = MaxDataRecvBuffers;
1275 		rcqattr.cq_sched = NULL;
1276 		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1277 
1278 		chanargs.rc_sizes.cs_sq = MaxDataSendBuffers + RDS_NUM_ACKS;
1279 		chanargs.rc_sizes.cs_rq = MaxDataRecvBuffers;
1280 		chanargs.rc_sizes.cs_sq_sgl = 1;
1281 		chanargs.rc_sizes.cs_rq_sgl = 1;
1282 	}
1283 
1284 	mutex_enter(&ep->ep_lock);
1285 	if (ep->ep_sendcq == NULL) {
1286 		/* returned size is always greater than the requested size */
1287 		ret = ibt_alloc_cq(hcap->hca_hdl, &scqattr,
1288 		    &ep->ep_sendcq, NULL);
1289 		if (ret != IBT_SUCCESS) {
1290 			RDS_DPRINTF2(LABEL, "ibt_alloc_cq for sendCQ "
1291 			    "failed, size = %d: %d", scqattr.cq_size, ret);
1292 			mutex_exit(&ep->ep_lock);
1293 			return (NULL);
1294 		}
1295 
1296 		(void) ibt_set_cq_handler(ep->ep_sendcq, rds_sendcq_handler,
1297 		    ep);
1298 
1299 		if (rds_no_interrupts == 0) {
1300 			ret = ibt_enable_cq_notify(ep->ep_sendcq,
1301 			    IBT_NEXT_COMPLETION);
1302 			if (ret != IBT_SUCCESS) {
1303 				RDS_DPRINTF2(LABEL,
1304 				    "ibt_enable_cq_notify failed: %d", ret);
1305 				(void) ibt_free_cq(ep->ep_sendcq);
1306 				ep->ep_sendcq = NULL;
1307 				mutex_exit(&ep->ep_lock);
1308 				return (NULL);
1309 			}
1310 		}
1311 	}
1312 
1313 	if (ep->ep_recvcq == NULL) {
1314 		/* returned size is always greater than the requested size */
1315 		ret = ibt_alloc_cq(hcap->hca_hdl, &rcqattr,
1316 		    &ep->ep_recvcq, NULL);
1317 		if (ret != IBT_SUCCESS) {
1318 			RDS_DPRINTF2(LABEL, "ibt_alloc_cq for recvCQ "
1319 			    "failed, size = %d: %d", rcqattr.cq_size, ret);
1320 			(void) ibt_free_cq(ep->ep_sendcq);
1321 			ep->ep_sendcq = NULL;
1322 			mutex_exit(&ep->ep_lock);
1323 			return (NULL);
1324 		}
1325 
1326 		(void) ibt_set_cq_handler(ep->ep_recvcq, rds_recvcq_handler,
1327 		    ep);
1328 
1329 		ret = ibt_enable_cq_notify(ep->ep_recvcq, rds_wc_signal);
1330 		if (ret != IBT_SUCCESS) {
1331 			RDS_DPRINTF2(LABEL,
1332 			    "ibt_enable_cq_notify failed: %d", ret);
1333 			(void) ibt_free_cq(ep->ep_recvcq);
1334 			ep->ep_recvcq = NULL;
1335 			(void) ibt_free_cq(ep->ep_sendcq);
1336 			ep->ep_sendcq = NULL;
1337 			mutex_exit(&ep->ep_lock);
1338 			return (NULL);
1339 		}
1340 	}
1341 
1342 	chanargs.rc_flags = IBT_ALL_SIGNALED;
1343 	chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
1344 	    IBT_CEP_ATOMIC;
1345 	chanargs.rc_hca_port_num = hca_port;
1346 	chanargs.rc_scq = ep->ep_sendcq;
1347 	chanargs.rc_rcq = ep->ep_recvcq;
1348 	chanargs.rc_pd = hcap->hca_pdhdl;
1349 	chanargs.rc_srq = NULL;
1350 
1351 	ret = ibt_alloc_rc_channel(hcap->hca_hdl,
1352 	    IBT_ACHAN_NO_FLAGS, &chanargs, &chanhdl, NULL);
1353 	if (ret != IBT_SUCCESS) {
1354 		RDS_DPRINTF2(LABEL, "ibt_alloc_rc_channel fail: %d",
1355 		    ret);
1356 		(void) ibt_free_cq(ep->ep_recvcq);
1357 		ep->ep_recvcq = NULL;
1358 		(void) ibt_free_cq(ep->ep_sendcq);
1359 		ep->ep_sendcq = NULL;
1360 		mutex_exit(&ep->ep_lock);
1361 		return (NULL);
1362 	}
1363 	mutex_exit(&ep->ep_lock);
1364 
1365 	/* Chan private should contain the ep */
1366 	(void) ibt_set_chan_private(chanhdl, ep);
1367 
1368 	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Return: 0x%p", chanhdl);
1369 
1370 	return (chanhdl);
1371 }
1372 
1373 
1374 #if 0
1375 
1376 /* Return node guid given a port gid */
1377 ib_guid_t
1378 rds_gid_to_node_guid(ib_gid_t gid)
1379 {
1380 	ibt_node_info_t	nodeinfo;
1381 	int		ret;
1382 
1383 	RDS_DPRINTF4("rds_gid_to_node_guid", "Enter: gid: %llx:%llx",
1384 	    gid.gid_prefix, gid.gid_guid);
1385 
1386 	ret = ibt_gid_to_node_info(gid, &nodeinfo);
1387 	if (ret != IBT_SUCCESS) {
1388 		RDS_DPRINTF2(LABEL, "ibt_gid_node_info for gid: %llx:%llx "
1389 		    "failed", gid.gid_prefix, gid.gid_guid);
1390 		return (0LL);
1391 	}
1392 
1393 	RDS_DPRINTF4("rds_gid_to_node_guid", "Return: Node guid: %llx",
1394 	    nodeinfo.n_node_guid);
1395 
1396 	return (nodeinfo.n_node_guid);
1397 }
1398 
1399 #endif
1400 
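/*
 * IBT_EVENT_PORT_UP handler: refresh the cached port information for the
 * HCA and, if the RDS service is registered but not yet bound on this
 * port, bind it there.
 */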
1401 static void
1402 rds_handle_portup_event(rds_state_t *statep, ibt_hca_hdl_t hdl,
1403     ibt_async_event_t *event)
1404 {
1405 	rds_hca_t		*hcap;
1406 	ibt_hca_portinfo_t	*newpinfop, *oldpinfop;
1407 	uint_t			newsize, oldsize, nport;
1408 	ib_gid_t		gid;
1409 	int			ret;
1410 
1411 	RDS_DPRINTF2("rds_handle_portup_event",
1412 	    "Enter: GUID: 0x%llx Statep: %p", event->ev_hca_guid, statep);
1413 
1414 	rw_enter(&statep->rds_hca_lock, RW_WRITER);
1415 
1416 	hcap = statep->rds_hcalistp;
1417 	while ((hcap != NULL) && (hcap->hca_guid != event->ev_hca_guid)) {
1418 		hcap = hcap->hca_nextp;
1419 	}
1420 
1421 	if (hcap == NULL) {
1422 		RDS_DPRINTF2("rds_handle_portup_event", "HCA: 0x%llx is "
1423 		    "not in our list", event->ev_hca_guid);
1424 		rw_exit(&statep->rds_hca_lock);
1425 		return;
1426 	}
1427 
1428 	ret = ibt_query_hca_ports(hdl, 0, &newpinfop, &nport, &newsize);
1429 	if (ret != IBT_SUCCESS) {
1430 		RDS_DPRINTF2(LABEL, "ibt_query_hca_ports failed: %d", ret);
1431 		rw_exit(&statep->rds_hca_lock);
1432 		return;
1433 	}
1434 
1435 	oldpinfop = hcap->hca_pinfop;
1436 	oldsize = hcap->hca_pinfo_sz;
1437 	hcap->hca_pinfop = newpinfop;
1438 	hcap->hca_pinfo_sz = newsize;
1439 
1440 	(void) ibt_free_portinfo(oldpinfop, oldsize);
1441 
1442 	/* If RDS service is not registered then no bind is needed */
1443 	if (statep->rds_srvhdl == NULL) {
1444 		RDS_DPRINTF2("rds_handle_portup_event",
1445 		    "RDS Service is not registered, so no action needed");
1446 		rw_exit(&statep->rds_hca_lock);
1447 		return;
1448 	}
1449 
1450 	/*
1451 	 * If the service was previously bound on this port and the port
1452 	 * has since gone down and come back up, we do not need to bind
1453 	 * the service again; the bind is expected to persist across port
1454 	 * state changes. If the service was never bound before, then we
1455 	 * bind it now.
1456 	 */
1457 	if (hcap->hca_bindhdl[event->ev_port - 1] == NULL) {
1458 
1459 		/* structure copy */
1460 		gid = newpinfop[event->ev_port - 1].p_sgid_tbl[0];
1461 
1462 		/* bind RDS service on the port, pass statep as cm_private */
1463 		ret = ibt_bind_service(statep->rds_srvhdl, gid, NULL, statep,
1464 		    &hcap->hca_bindhdl[event->ev_port - 1]);
1465 		if (ret != IBT_SUCCESS) {
1466 			RDS_DPRINTF2("rds_handle_portup_event",
1467 			    "Bind service for HCA: 0x%llx Port: %d "
1468 			    "gid %llx:%llx returned: %d", event->ev_hca_guid,
1469 			    event->ev_port, gid.gid_prefix, gid.gid_guid, ret);
1470 		}
1471 	}
1472 
1473 	rw_exit(&statep->rds_hca_lock);
1474 
1475 	RDS_DPRINTF2("rds_handle_portup_event", "Return: GUID: 0x%llx",
1476 	    event->ev_hca_guid);
1477 }
1478 
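/*
 * IBT_HCA_ATTACH_EVENT handler: initialize the new HCA and register the
 * receive buffer pool memory with it so that sessions can use it.
 */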
1479 static void
1480 rdsib_add_hca(ib_guid_t hca_guid)
1481 {
1482 	rds_hca_t	*hcap;
1483 	ibt_mr_attr_t	mem_attr;
1484 	ibt_mr_desc_t	mem_desc;
1485 	int		ret;
1486 
1487 	RDS_DPRINTF2("rdsib_add_hca", "Enter: GUID: 0x%llx", hca_guid);
1488 
1489 	hcap = rdsib_init_hca(hca_guid);
1490 	if (hcap == NULL)
1491 		return;
1492 
1493 	/* register the recv memory with this hca */
1494 	mutex_enter(&rds_dpool.pool_lock);
1495 	if (rds_dpool.pool_memp == NULL) {
1496 		/* no memory to register */
1497 		RDS_DPRINTF2("rdsib_add_hca", "No memory to register");
1498 		mutex_exit(&rds_dpool.pool_lock);
1499 		return;
1500 	}
1501 
1502 	mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)rds_dpool.pool_memp;
1503 	mem_attr.mr_len = rds_dpool.pool_memsize;
1504 	mem_attr.mr_as = NULL;
1505 	mem_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;
1506 
1507 	ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl, &mem_attr,
1508 	    &hcap->hca_mrhdl, &mem_desc);
1509 
1510 	mutex_exit(&rds_dpool.pool_lock);
1511 
1512 	if (ret != IBT_SUCCESS) {
1513 		RDS_DPRINTF2("rdsib_add_hca", "ibt_register_mr failed: %d",
1514 		    ret);
1515 	} else {
1516 		rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
1517 		hcap->hca_state = RDS_HCA_STATE_MEM_REGISTERED;
1518 		hcap->hca_lkey = mem_desc.md_lkey;
1519 		hcap->hca_rkey = mem_desc.md_rkey;
1520 		rw_exit(&rdsib_statep->rds_hca_lock);
1521 	}
1522 
1523 	RDS_DPRINTF2("rdsib_add_hca", "Return: GUID: 0x%llx", hca_guid);
1524 }
1525 
1526 void rds_close_this_session(rds_session_t *sp, uint8_t wait);
1527 int rds_post_control_message(rds_session_t *sp, uint8_t code, in_port_t port);
1528 
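/*
 * IBT_HCA_DETACH_EVENT handler: quiesce and close all sessions on the
 * departing HCA, unbind the RDS service from its ports and release the
 * memory region, protection domain and HCA handle.
 */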
1529 static void
1530 rdsib_del_hca(rds_state_t *statep, ib_guid_t hca_guid)
1531 {
1532 	rds_session_t	*sp;
1533 	rds_hca_t	*hcap;
1534 	rds_hca_state_t	saved_state;
1535 	int		ret, ix;
1536 
1537 	RDS_DPRINTF2("rdsib_del_hca", "Enter: GUID: 0x%llx", hca_guid);
1538 
1539 	/*
1540 	 * This should be a write lock as we don't want anyone to get access
1541 	 * to the hcap while we are modifying its contents
1542 	 */
1543 	rw_enter(&statep->rds_hca_lock, RW_WRITER);
1544 
1545 	hcap = statep->rds_hcalistp;
1546 	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
1547 		hcap = hcap->hca_nextp;
1548 	}
1549 
1550 	/* Prevent initiating any new activity on this HCA */
1551 	ASSERT(hcap != NULL);
1552 	saved_state = hcap->hca_state;
1553 	hcap->hca_state = RDS_HCA_STATE_STOPPING;
1554 
1555 	rw_exit(&statep->rds_hca_lock);
1556 
1557 	/*
1558 	 * stop the outgoing traffic and close any active sessions on this hca.
1559 	 * Any pending messages in the SQ will be allowed to complete.
1560 	 */
1561 	rw_enter(&statep->rds_sessionlock, RW_READER);
1562 	sp = statep->rds_sessionlistp;
1563 	while (sp) {
1564 		if (sp->session_hca_guid != hca_guid) {
1565 			sp = sp->session_nextp;
1566 			continue;
1567 		}
1568 
1569 		rw_enter(&sp->session_lock, RW_WRITER);
1570 		RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1571 		    sp->session_state);
1572 		/*
1573 		 * We are changing the session state in advance. This prevents
1574 		 * further messages from being posted to the SQ. We then send
1575 		 * a control message to the remote and tell it to close the
1576 		 * session.
1577 		 */
1578 		sp->session_state = RDS_SESSION_STATE_HCA_CLOSING;
1579 		RDS_DPRINTF3("rdsib_del_hca", "SP(%p) State "
1580 		    "RDS_SESSION_STATE_HCA_CLOSING", sp);
1581 		rw_exit(&sp->session_lock);
1582 
1583 		/*
1584 		 * wait until the sendq is empty, then tell the remote to
1585 		 * close this session. This allows for a graceful shutdown
1586 		 * of the session.
1587 		 */
1588 		(void) rds_is_sendq_empty(&sp->session_dataep, 2);
1589 		(void) rds_post_control_message(sp,
1590 		    RDS_CTRL_CODE_CLOSE_SESSION, 0);
1591 
1592 		sp = sp->session_nextp;
1593 	}
1594 
1595 	/* wait until all the sessions are off this HCA */
1596 	sp = statep->rds_sessionlistp;
1597 	while (sp) {
1598 		if (sp->session_hca_guid != hca_guid) {
1599 			sp = sp->session_nextp;
1600 			continue;
1601 		}
1602 
1603 		rw_enter(&sp->session_lock, RW_READER);
1604 		RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1605 		    sp->session_state);
1606 
1607 		while ((sp->session_state == RDS_SESSION_STATE_HCA_CLOSING) ||
1608 		    (sp->session_state == RDS_SESSION_STATE_ERROR) ||
1609 		    (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING) ||
1610 		    (sp->session_state == RDS_SESSION_STATE_CLOSED)) {
1611 			rw_exit(&sp->session_lock);
1612 			delay(drv_usectohz(1000000));
1613 			rw_enter(&sp->session_lock, RW_READER);
1614 			RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1615 			    sp->session_state);
1616 		}
1617 
1618 		rw_exit(&sp->session_lock);
1619 
1620 		sp = sp->session_nextp;
1621 	}
1622 	rw_exit(&statep->rds_sessionlock);
1623 
1624 	/*
1625 	 * if rdsib_close_ib was called before this, then that would have
1626 	 * unbound the service on all ports. In that case, the HCA structs
1627 	 * will contain stale bindhdls. Hence, we do not call unbind unless
1628 	 * the service is still registered.
1629 	 */
1630 	if (statep->rds_srvhdl != NULL) {
1631 		/* unbind RDS service on all ports on this HCA */
1632 		for (ix = 0; ix < hcap->hca_nports; ix++) {
1633 			if (hcap->hca_bindhdl[ix] == NULL) {
1634 				continue;
1635 			}
1636 
1637 			RDS_DPRINTF2("rdsib_del_hca",
1638 			    "Unbinding Service: port: %d, bindhdl: %p",
1639 			    ix + 1, hcap->hca_bindhdl[ix]);
1640 			(void) ibt_unbind_service(rdsib_statep->rds_srvhdl,
1641 			    hcap->hca_bindhdl[ix]);
1642 			hcap->hca_bindhdl[ix] = NULL;
1643 		}
1644 	}
1645 
1646 	RDS_DPRINTF2("rdsib_del_hca", "HCA(%p) State: %d", hcap,
1647 	    hcap->hca_state);
1648 
1649 	switch (saved_state) {
1650 	case RDS_HCA_STATE_MEM_REGISTERED:
1651 		ASSERT(hcap->hca_mrhdl != NULL);
1652 		ret = ibt_deregister_mr(hcap->hca_hdl, hcap->hca_mrhdl);
1653 		if (ret != IBT_SUCCESS) {
1654 			RDS_DPRINTF2("rdsib_del_hca",
1655 			    "ibt_deregister_mr failed: %d", ret);
1656 			return;
1657 		}
1658 		hcap->hca_mrhdl = NULL;
1659 		/* FALLTHRU */
1660 	case RDS_HCA_STATE_OPEN:
1661 		ASSERT(hcap->hca_hdl != NULL);
1662 		ASSERT(hcap->hca_pdhdl != NULL);
1663 
1664 
1665 		ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
1666 		if (ret != IBT_SUCCESS) {
1667 			RDS_DPRINTF2("rdsib_del_hca",
1668 			    "ibt_free_pd failed: %d", ret);
1669 		}
1670 
1671 		(void) ibt_free_portinfo(hcap->hca_pinfop, hcap->hca_pinfo_sz);
1672 
1673 		ret = ibt_close_hca(hcap->hca_hdl);
1674 		if (ret != IBT_SUCCESS) {
1675 			RDS_DPRINTF2("rdsib_del_hca",
1676 			    "ibt_close_hca failed: %d", ret);
1677 		}
1678 
1679 		hcap->hca_hdl = NULL;
1680 		hcap->hca_pdhdl = NULL;
1681 		hcap->hca_lkey = 0;
1682 		hcap->hca_rkey = 0;
1683 	}
1684 
1685 	/*
1686 	 * This should be a write lock as we don't want anyone to get access
1687 	 * to the hcap while we are modifying its contents
1688 	 */
1689 	rw_enter(&statep->rds_hca_lock, RW_WRITER);
1690 	hcap->hca_state = RDS_HCA_STATE_REMOVED;
1691 	rw_exit(&statep->rds_hca_lock);
1692 
1693 	RDS_DPRINTF2("rdsib_del_hca", "Return: GUID: 0x%llx", hca_guid);
1694 }
1695 
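/*
 * IBTF async event handler registered via rds_ib_modinfo. Dispatches
 * port-up, HCA attach and HCA detach events; all other events are only
 * logged.
 */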
1696 static void
1697 rds_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
1698     ibt_async_event_t *event)
1699 {
1700 	rds_state_t		*statep = (rds_state_t *)clntp;
1701 
1702 	RDS_DPRINTF2("rds_async_handler", "Async code: %d", code);
1703 
1704 	switch (code) {
1705 	case IBT_EVENT_PORT_UP:
1706 		rds_handle_portup_event(statep, hdl, event);
1707 		break;
1708 	case IBT_HCA_ATTACH_EVENT:
1709 		/*
1710 		 * NOTE: In some error recovery paths, it is possible to
1711 		 * receive IBT_HCA_ATTACH_EVENTs on already known HCAs.
1712 		 */
1713 		(void) rdsib_add_hca(event->ev_hca_guid);
1714 		break;
1715 	case IBT_HCA_DETACH_EVENT:
1716 		(void) rdsib_del_hca(statep, event->ev_hca_guid);
1717 		break;
1718 
1719 	default:
1720 		RDS_DPRINTF2(LABEL, "Async event: %d not handled", code);
1721 	}
1722 
1723 	RDS_DPRINTF2("rds_async_handler", "Return: code: %d", code);
1724 }
1725 
1726 /*
1727  * This routine exists to minimize stale connections across ungraceful
1728  * reboots of nodes in a cluster.
1729  */
1730 void
1731 rds_randomize_qps(rds_hca_t *hcap)
1732 {
1733 	ibt_cq_attr_t			cqattr;
1734 	ibt_rc_chan_alloc_args_t	chanargs;
1735 	ibt_channel_hdl_t		qp1, qp2;
1736 	ibt_cq_hdl_t			cq_hdl;
1737 	hrtime_t			nsec;
1738 	uint8_t				i, j, rand1, rand2;
1739 	int				ret;
1740 
1741 	bzero(&cqattr, sizeof (ibt_cq_attr_t));
1742 	cqattr.cq_size = 1;
1743 	cqattr.cq_sched = NULL;
1744 	cqattr.cq_flags = IBT_CQ_NO_FLAGS;
1745 	ret = ibt_alloc_cq(hcap->hca_hdl, &cqattr, &cq_hdl, NULL);
1746 	if (ret != IBT_SUCCESS) {
1747 		RDS_DPRINTF2("rds_randomize_qps",
1748 		    "ibt_alloc_cq failed: %d", ret);
1749 		return;
1750 	}
1751 
1752 	bzero(&chanargs, sizeof (ibt_rc_chan_alloc_args_t));
1753 	chanargs.rc_flags = IBT_ALL_SIGNALED;
1754 	chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
1755 	    IBT_CEP_ATOMIC;
1756 	chanargs.rc_hca_port_num = 1;
1757 	chanargs.rc_scq = cq_hdl;
1758 	chanargs.rc_rcq = cq_hdl;
1759 	chanargs.rc_pd = hcap->hca_pdhdl;
1760 	chanargs.rc_srq = NULL;
1761 
1762 	nsec = gethrtime();
1763 	rand1 = (nsec & 0xF);
1764 	rand2 = (nsec >> 4) & 0xF;
1765 	RDS_DPRINTF2("rds_randomize_qps", "rand1: %d rand2: %d",
1766 	    rand1, rand2);
1767 
1768 	for (i = 0; i < rand1 + 3; i++) {
1769 		if (ibt_alloc_rc_channel(hcap->hca_hdl,
1770 		    IBT_ACHAN_NO_FLAGS, &chanargs, &qp1, NULL) !=
1771 		    IBT_SUCCESS) {
1772 			RDS_DPRINTF2("rds_randomize_qps",
1773 			    "Bailing at i: %d", i);
1774 			(void) ibt_free_cq(cq_hdl);
1775 			return;
1776 		}
1777 		for (j = 0; j < rand2 + 3; j++) {
1778 			if (ibt_alloc_rc_channel(hcap->hca_hdl,
1779 			    IBT_ACHAN_NO_FLAGS, &chanargs, &qp2,
1780 			    NULL) != IBT_SUCCESS) {
1781 				RDS_DPRINTF2("rds_randomize_qps",
1782 				    "Bailing at i: %d j: %d", i, j);
1783 				(void) ibt_free_channel(qp1);
1784 				(void) ibt_free_cq(cq_hdl);
1785 				return;
1786 			}
1787 			(void) ibt_free_channel(qp2);
1788 		}
1789 		(void) ibt_free_channel(qp1);
1790 	}
1791 
1792 	(void) ibt_free_cq(cq_hdl);
1793 }
1794