xref: /illumos-gate/usr/src/uts/common/io/ib/clients/rds/rdsib_ib.c (revision 00d1d19828f3122eb24ae7a68175ba64744f8366)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /*
25  * Copyright (c) 2005 SilverStorm Technologies, Inc. All rights reserved.
26  *
27  * This software is available to you under a choice of one of two
28  * licenses.  You may choose to be licensed under the terms of the GNU
29  * General Public License (GPL) Version 2, available from the file
30  * COPYING in the main directory of this source tree, or the
31  * OpenIB.org BSD license below:
32  *
33  *     Redistribution and use in source and binary forms, with or
34  *     without modification, are permitted provided that the following
35  *     conditions are met:
36  *
37  *	- Redistributions of source code must retain the above
38  *	  copyright notice, this list of conditions and the following
39  *	  disclaimer.
40  *
41  *	- Redistributions in binary form must reproduce the above
42  *	  copyright notice, this list of conditions and the following
43  *	  disclaimer in the documentation and/or other materials
44  *	  provided with the distribution.
45  *
46  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
47  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
48  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
49  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
50  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
51  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
52  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
53  * SOFTWARE.
54  *
55  */
56 /*
57  * Sun elects to include this software in Sun product
58  * under the OpenIB BSD license.
59  *
60  *
61  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
62  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
63  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
64  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
65  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
66  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
67  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
68  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
69  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
70  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
71  * POSSIBILITY OF SUCH DAMAGE.
72  */
73 
74 #include <sys/types.h>
75 #include <sys/ddi.h>
76 #include <sys/sunddi.h>
77 #include <sys/ib/clients/rds/rdsib_cm.h>
78 #include <sys/ib/clients/rds/rdsib_ib.h>
79 #include <sys/ib/clients/rds/rdsib_buf.h>
80 #include <sys/ib/clients/rds/rdsib_ep.h>
81 #include <sys/ib/clients/rds/rds_kstat.h>
82 
/*
 * Forward declaration of the asynchronous event callback that is placed in
 * rds_ib_modinfo below; it is defined elsewhere.
 */
static void rds_async_handler(void *clntp, ibt_hca_hdl_t hdl,
    ibt_async_code_t code, ibt_async_event_t *event);

/*
 * Client registration record passed to ibt_attach() by
 * rdsib_initialize_ib().  The initializers are positional; the per-field
 * notes below are inferred from the values and should be confirmed against
 * the definition of struct ibt_clnt_modinfo_s.
 */
static struct ibt_clnt_modinfo_s rds_ib_modinfo = {
	IBTI_V_CURR,		/* presumably the IBTI interface version */
	IBT_NETWORK,		/* presumably the client class */
	rds_async_handler,	/* async event callback declared above */
	NULL,
	"RDS"			/* presumably the client name */
};
93 
/*
 * Performance tunables.
 *
 * In this part of the file only rds_wc_signal is consumed: it is the
 * notification type passed to ibt_enable_cq_notify() when the receive CQ
 * handler re-arms the CQ.  The other knobs are presumably read by other RDS
 * code; check their users before changing the defaults.
 */
uint_t		rds_no_interrupts = 0;
uint_t		rds_poll_percent_full = 25;
uint_t		rds_wc_signal = IBT_NEXT_SOLICITED;
uint_t		rds_waittime_ms = 100; /* ms */

/* Defined elsewhere in the RDS driver */
extern dev_info_t *rdsib_dev_info;
extern void rds_close_sessions();
102 
103 static void
rdsib_validate_chan_sizes(ibt_hca_attr_t * hattrp)104 rdsib_validate_chan_sizes(ibt_hca_attr_t *hattrp)
105 {
106 	/* The SQ size should not be more than that supported by the HCA */
107 	if (((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_chan_sz) ||
108 	    ((MaxDataSendBuffers + RDS_NUM_ACKS) > hattrp->hca_max_cq_sz)) {
109 		RDS_DPRINTF2("RDSIB", "MaxDataSendBuffers + %d is greater "
110 		    "than that supported by the HCA driver "
111 		    "(%d + %d > %d or %d), lowering it to a supported value.",
112 		    RDS_NUM_ACKS, MaxDataSendBuffers, RDS_NUM_ACKS,
113 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
114 
115 		MaxDataSendBuffers = (hattrp->hca_max_chan_sz >
116 		    hattrp->hca_max_cq_sz) ?
117 		    hattrp->hca_max_cq_sz - RDS_NUM_ACKS :
118 		    hattrp->hca_max_chan_sz - RDS_NUM_ACKS;
119 	}
120 
121 	/* The RQ size should not be more than that supported by the HCA */
122 	if ((MaxDataRecvBuffers > hattrp->hca_max_chan_sz) ||
123 	    (MaxDataRecvBuffers > hattrp->hca_max_cq_sz)) {
124 		RDS_DPRINTF2("RDSIB", "MaxDataRecvBuffers is greater than that "
125 		    "supported by the HCA driver (%d > %d or %d), lowering it "
126 		    "to a supported value.", MaxDataRecvBuffers,
127 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
128 
129 		MaxDataRecvBuffers = (hattrp->hca_max_chan_sz >
130 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
131 		    hattrp->hca_max_chan_sz;
132 	}
133 
134 	/* The SQ size should not be more than that supported by the HCA */
135 	if ((MaxCtrlSendBuffers > hattrp->hca_max_chan_sz) ||
136 	    (MaxCtrlSendBuffers > hattrp->hca_max_cq_sz)) {
137 		RDS_DPRINTF2("RDSIB", "MaxCtrlSendBuffers is greater than that "
138 		    "supported by the HCA driver (%d > %d or %d), lowering it "
139 		    "to a supported value.", MaxCtrlSendBuffers,
140 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
141 
142 		MaxCtrlSendBuffers = (hattrp->hca_max_chan_sz >
143 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
144 		    hattrp->hca_max_chan_sz;
145 	}
146 
147 	/* The RQ size should not be more than that supported by the HCA */
148 	if ((MaxCtrlRecvBuffers > hattrp->hca_max_chan_sz) ||
149 	    (MaxCtrlRecvBuffers > hattrp->hca_max_cq_sz)) {
150 		RDS_DPRINTF2("RDSIB", "MaxCtrlRecvBuffers is greater than that "
151 		    "supported by the HCA driver (%d > %d or %d), lowering it "
152 		    "to a supported value.", MaxCtrlRecvBuffers,
153 		    hattrp->hca_max_chan_sz, hattrp->hca_max_cq_sz);
154 
155 		MaxCtrlRecvBuffers = (hattrp->hca_max_chan_sz >
156 		    hattrp->hca_max_cq_sz) ? hattrp->hca_max_cq_sz :
157 		    hattrp->hca_max_chan_sz;
158 	}
159 
160 	/* The MaxRecvMemory should be less than that supported by the HCA */
161 	if ((NDataRX * RdsPktSize) > hattrp->hca_max_memr_len) {
162 		RDS_DPRINTF2("RDSIB", "MaxRecvMemory is greater than that "
163 		    "supported by the HCA driver (%d > %d), lowering it to %d",
164 		    NDataRX * RdsPktSize, hattrp->hca_max_memr_len,
165 		    hattrp->hca_max_memr_len);
166 
167 		NDataRX = hattrp->hca_max_memr_len/RdsPktSize;
168 	}
169 }
170 
171 /* Return hcap, given the hca guid */
172 rds_hca_t *
rds_lkup_hca(ib_guid_t hca_guid)173 rds_lkup_hca(ib_guid_t hca_guid)
174 {
175 	rds_hca_t	*hcap;
176 
177 	RDS_DPRINTF4("rds_lkup_hca", "Enter: statep: 0x%p "
178 	    "guid: %llx", rdsib_statep, hca_guid);
179 
180 	rw_enter(&rdsib_statep->rds_hca_lock, RW_READER);
181 
182 	hcap = rdsib_statep->rds_hcalistp;
183 	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
184 		hcap = hcap->hca_nextp;
185 	}
186 
187 	rw_exit(&rdsib_statep->rds_hca_lock);
188 
189 	RDS_DPRINTF4("rds_lkup_hca", "return");
190 
191 	return (hcap);
192 }
193 
194 void rds_randomize_qps(rds_hca_t *hcap);
195 
196 static rds_hca_t *
rdsib_init_hca(ib_guid_t hca_guid)197 rdsib_init_hca(ib_guid_t hca_guid)
198 {
199 	rds_hca_t	*hcap;
200 	boolean_t	alloc = B_FALSE;
201 	int		ret;
202 
203 	RDS_DPRINTF2("rdsib_init_hca", "enter: HCA 0x%llx", hca_guid);
204 
205 	/* Do a HCA lookup */
206 	hcap = rds_lkup_hca(hca_guid);
207 
208 	if (hcap != NULL && hcap->hca_hdl != NULL) {
209 		/*
210 		 * This can happen if we get IBT_HCA_ATTACH_EVENT on an HCA
211 		 * that we have already opened. Just return NULL so that
212 		 * we'll not end up reinitializing the HCA again.
213 		 */
214 		RDS_DPRINTF2("rdsib_init_hca", "HCA already initialized");
215 		return (NULL);
216 	}
217 
218 	if (hcap == NULL) {
219 		RDS_DPRINTF2("rdsib_init_hca", "New HCA is added");
220 		hcap = (rds_hca_t *)kmem_zalloc(sizeof (rds_hca_t), KM_SLEEP);
221 		alloc = B_TRUE;
222 	}
223 
224 	hcap->hca_guid = hca_guid;
225 	ret = ibt_open_hca(rdsib_statep->rds_ibhdl, hca_guid,
226 	    &hcap->hca_hdl);
227 	if (ret != IBT_SUCCESS) {
228 		if (ret == IBT_HCA_IN_USE) {
229 			RDS_DPRINTF2("rdsib_init_hca",
230 			    "ibt_open_hca: 0x%llx returned IBT_HCA_IN_USE",
231 			    hca_guid);
232 		} else {
233 			RDS_DPRINTF2("rdsib_init_hca",
234 			    "ibt_open_hca: 0x%llx failed: %d", hca_guid, ret);
235 		}
236 		if (alloc == B_TRUE) {
237 			kmem_free(hcap, sizeof (rds_hca_t));
238 		}
239 		return (NULL);
240 	}
241 
242 	ret = ibt_query_hca(hcap->hca_hdl, &hcap->hca_attr);
243 	if (ret != IBT_SUCCESS) {
244 		RDS_DPRINTF2("rdsib_init_hca",
245 		    "Query HCA: 0x%llx failed:  %d", hca_guid, ret);
246 		ret = ibt_close_hca(hcap->hca_hdl);
247 		ASSERT(ret == IBT_SUCCESS);
248 		if (alloc == B_TRUE) {
249 			kmem_free(hcap, sizeof (rds_hca_t));
250 		} else {
251 			hcap->hca_hdl = NULL;
252 		}
253 		return (NULL);
254 	}
255 
256 	ret = ibt_query_hca_ports(hcap->hca_hdl, 0,
257 	    &hcap->hca_pinfop, &hcap->hca_nports, &hcap->hca_pinfo_sz);
258 	if (ret != IBT_SUCCESS) {
259 		RDS_DPRINTF2("rdsib_init_hca",
260 		    "Query HCA 0x%llx ports failed: %d", hca_guid,
261 		    ret);
262 		ret = ibt_close_hca(hcap->hca_hdl);
263 		hcap->hca_hdl = NULL;
264 		ASSERT(ret == IBT_SUCCESS);
265 		if (alloc == B_TRUE) {
266 			kmem_free(hcap, sizeof (rds_hca_t));
267 		} else {
268 			hcap->hca_hdl = NULL;
269 		}
270 		return (NULL);
271 	}
272 
273 	/* Only one PD per HCA is allocated, so do it here */
274 	ret = ibt_alloc_pd(hcap->hca_hdl, IBT_PD_NO_FLAGS,
275 	    &hcap->hca_pdhdl);
276 	if (ret != IBT_SUCCESS) {
277 		RDS_DPRINTF2("rdsib_init_hca",
278 		    "ibt_alloc_pd 0x%llx failed: %d", hca_guid, ret);
279 		(void) ibt_free_portinfo(hcap->hca_pinfop,
280 		    hcap->hca_pinfo_sz);
281 		ret = ibt_close_hca(hcap->hca_hdl);
282 		ASSERT(ret == IBT_SUCCESS);
283 		hcap->hca_hdl = NULL;
284 		if (alloc == B_TRUE) {
285 			kmem_free(hcap, sizeof (rds_hca_t));
286 		} else {
287 			hcap->hca_hdl = NULL;
288 		}
289 		return (NULL);
290 	}
291 
292 	rdsib_validate_chan_sizes(&hcap->hca_attr);
293 
294 	/* To minimize stale connections after ungraceful reboots */
295 	rds_randomize_qps(hcap);
296 
297 	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
298 	hcap->hca_state = RDS_HCA_STATE_OPEN;
299 	if (alloc == B_TRUE) {
300 		/* this is a new HCA, add it to the list */
301 		rdsib_statep->rds_nhcas++;
302 		hcap->hca_nextp = rdsib_statep->rds_hcalistp;
303 		rdsib_statep->rds_hcalistp = hcap;
304 	}
305 	rw_exit(&rdsib_statep->rds_hca_lock);
306 
307 	RDS_DPRINTF2("rdsib_init_hca", "return: HCA 0x%llx", hca_guid);
308 
309 	return (hcap);
310 }
311 
312 /*
313  * Called from attach
314  */
315 int
rdsib_initialize_ib()316 rdsib_initialize_ib()
317 {
318 	ib_guid_t	*guidp;
319 	rds_hca_t	*hcap;
320 	uint_t		ix, hcaix, nhcas;
321 	int		ret;
322 
323 	RDS_DPRINTF2("rdsib_initialize_ib", "enter: statep %p", rdsib_statep);
324 
325 	ASSERT(rdsib_statep != NULL);
326 	if (rdsib_statep == NULL) {
327 		RDS_DPRINTF1("rdsib_initialize_ib",
328 		    "RDS Statep not initialized");
329 		return (-1);
330 	}
331 
332 	/* How many hcas are there? */
333 	nhcas = ibt_get_hca_list(&guidp);
334 	if (nhcas == 0) {
335 		RDS_DPRINTF2("rdsib_initialize_ib", "No IB HCAs Available");
336 		return (-1);
337 	}
338 
339 	RDS_DPRINTF3("rdsib_initialize_ib", "Number of HCAs: %d", nhcas);
340 
341 	/* Register with IBTF */
342 	ret = ibt_attach(&rds_ib_modinfo, rdsib_dev_info, rdsib_statep,
343 	    &rdsib_statep->rds_ibhdl);
344 	if (ret != IBT_SUCCESS) {
345 		RDS_DPRINTF2("rdsib_initialize_ib", "ibt_attach failed: %d",
346 		    ret);
347 		(void) ibt_free_hca_list(guidp, nhcas);
348 		return (-1);
349 	}
350 
351 	/*
352 	 * Open each HCA and gather its information. Don't care about HCAs
353 	 * that cannot be opened. It is OK as long as atleast one HCA can be
354 	 * opened.
355 	 * Initialize a HCA only if all the information is available.
356 	 */
357 	for (ix = 0, hcaix = 0; ix < nhcas; ix++) {
358 		RDS_DPRINTF3(LABEL, "Open HCA: 0x%llx", guidp[ix]);
359 
360 		hcap = rdsib_init_hca(guidp[ix]);
361 		if (hcap != NULL) hcaix++;
362 	}
363 
364 	/* free the HCA list, we are done with it */
365 	(void) ibt_free_hca_list(guidp, nhcas);
366 
367 	if (hcaix == 0) {
368 		/* Failed to Initialize even one HCA */
369 		RDS_DPRINTF2("rdsib_initialize_ib", "No HCAs are initialized");
370 		(void) ibt_detach(rdsib_statep->rds_ibhdl);
371 		rdsib_statep->rds_ibhdl = NULL;
372 		return (-1);
373 	}
374 
375 	if (hcaix < nhcas) {
376 		RDS_DPRINTF2("rdsib_open_ib", "HCAs %d/%d failed to initialize",
377 		    (nhcas - hcaix), nhcas);
378 	}
379 
380 	RDS_DPRINTF2("rdsib_initialize_ib", "return: statep %p", rdsib_statep);
381 
382 	return (0);
383 }
384 
385 /*
386  * Called from detach
387  */
388 void
rdsib_deinitialize_ib()389 rdsib_deinitialize_ib()
390 {
391 	rds_hca_t	*hcap, *nextp;
392 	int		ret;
393 
394 	RDS_DPRINTF2("rdsib_deinitialize_ib", "enter: statep %p", rdsib_statep);
395 
396 	/* close and destroy all the sessions */
397 	rds_close_sessions(NULL);
398 
399 	/* Release all HCA resources */
400 	rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
401 	RDS_DPRINTF2("rdsib_deinitialize_ib", "HCA List: %p, NHCA: %d",
402 	    rdsib_statep->rds_hcalistp, rdsib_statep->rds_nhcas);
403 	hcap = rdsib_statep->rds_hcalistp;
404 	rdsib_statep->rds_hcalistp = NULL;
405 	rdsib_statep->rds_nhcas = 0;
406 	rw_exit(&rdsib_statep->rds_hca_lock);
407 
408 	while (hcap != NULL) {
409 		nextp = hcap->hca_nextp;
410 
411 		if (hcap->hca_hdl != NULL) {
412 			ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
413 			ASSERT(ret == IBT_SUCCESS);
414 
415 			(void) ibt_free_portinfo(hcap->hca_pinfop,
416 			    hcap->hca_pinfo_sz);
417 
418 			ret = ibt_close_hca(hcap->hca_hdl);
419 			ASSERT(ret == IBT_SUCCESS);
420 		}
421 
422 		kmem_free(hcap, sizeof (rds_hca_t));
423 		hcap = nextp;
424 	}
425 
426 	/* Deregister with IBTF */
427 	if (rdsib_statep->rds_ibhdl != NULL) {
428 		(void) ibt_detach(rdsib_statep->rds_ibhdl);
429 		rdsib_statep->rds_ibhdl = NULL;
430 	}
431 
432 	RDS_DPRINTF2("rdsib_deinitialize_ib", "return: statep %p",
433 	    rdsib_statep);
434 }
435 
436 /*
437  * Called on open of first RDS socket
438  */
439 int
rdsib_open_ib()440 rdsib_open_ib()
441 {
442 	int	ret;
443 
444 	RDS_DPRINTF2("rdsib_open_ib", "enter: statep %p", rdsib_statep);
445 
446 	/* Enable incoming connection requests */
447 	if (rdsib_statep->rds_srvhdl == NULL) {
448 		rdsib_statep->rds_srvhdl =
449 		    rds_register_service(rdsib_statep->rds_ibhdl);
450 		if (rdsib_statep->rds_srvhdl == NULL) {
451 			RDS_DPRINTF2("rdsib_open_ib",
452 			    "Service registration failed");
453 			return (-1);
454 		} else {
455 			/* bind the service on all available ports */
456 			ret = rds_bind_service(rdsib_statep);
457 			if (ret != 0) {
458 				RDS_DPRINTF2("rdsib_open_ib",
459 				    "Bind service failed: %d", ret);
460 			}
461 		}
462 	}
463 
464 	RDS_DPRINTF2("rdsib_open_ib", "return: statep %p", rdsib_statep);
465 
466 	return (0);
467 }
468 
469 /*
470  * Called when all ports are closed.
471  */
472 void
rdsib_close_ib()473 rdsib_close_ib()
474 {
475 	int	ret;
476 
477 	RDS_DPRINTF2("rdsib_close_ib", "enter: statep %p", rdsib_statep);
478 
479 	/* Disable incoming connection requests */
480 	if (rdsib_statep->rds_srvhdl != NULL) {
481 		ret = ibt_unbind_all_services(rdsib_statep->rds_srvhdl);
482 		if (ret != 0) {
483 			RDS_DPRINTF2("rdsib_close_ib",
484 			    "ibt_unbind_all_services failed: %d\n", ret);
485 		}
486 		ret = ibt_deregister_service(rdsib_statep->rds_ibhdl,
487 		    rdsib_statep->rds_srvhdl);
488 		if (ret != 0) {
489 			RDS_DPRINTF2("rdsib_close_ib",
490 			    "ibt_deregister_service failed: %d\n", ret);
491 		} else {
492 			rdsib_statep->rds_srvhdl = NULL;
493 		}
494 	}
495 
496 	RDS_DPRINTF2("rdsib_close_ib", "return: statep %p", rdsib_statep);
497 }
498 
499 /* Return hcap, given the hca guid */
500 rds_hca_t *
rds_get_hcap(rds_state_t * statep,ib_guid_t hca_guid)501 rds_get_hcap(rds_state_t *statep, ib_guid_t hca_guid)
502 {
503 	rds_hca_t	*hcap;
504 
505 	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: Enter: statep: 0x%p "
506 	    "guid: %llx", statep, hca_guid);
507 
508 	rw_enter(&statep->rds_hca_lock, RW_READER);
509 
510 	hcap = statep->rds_hcalistp;
511 	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
512 		hcap = hcap->hca_nextp;
513 	}
514 
515 	/*
516 	 * don't let anyone use this HCA until the RECV memory
517 	 * is registered with this HCA
518 	 */
519 	if ((hcap != NULL) &&
520 	    (hcap->hca_state == RDS_HCA_STATE_MEM_REGISTERED)) {
521 		ASSERT(hcap->hca_mrhdl != NULL);
522 		rw_exit(&statep->rds_hca_lock);
523 		return (hcap);
524 	}
525 
526 	RDS_DPRINTF2("rds_get_hcap",
527 	    "HCA (0x%p, 0x%llx) is not initialized", hcap, hca_guid);
528 	rw_exit(&statep->rds_hca_lock);
529 
530 	RDS_DPRINTF4("rds_get_hcap", "rds_get_hcap: return");
531 
532 	return (NULL);
533 }
534 
535 /* Return hcap, given a gid */
536 rds_hca_t *
rds_gid_to_hcap(rds_state_t * statep,ib_gid_t gid)537 rds_gid_to_hcap(rds_state_t *statep, ib_gid_t gid)
538 {
539 	rds_hca_t	*hcap;
540 	uint_t		ix;
541 
542 	RDS_DPRINTF4("rds_gid_to_hcap", "Enter: statep: 0x%p gid: %llx:%llx",
543 	    statep, gid.gid_prefix, gid.gid_guid);
544 
545 	rw_enter(&statep->rds_hca_lock, RW_READER);
546 
547 	hcap = statep->rds_hcalistp;
548 	while (hcap != NULL) {
549 
550 		/*
551 		 * don't let anyone use this HCA until the RECV memory
552 		 * is registered with this HCA
553 		 */
554 		if (hcap->hca_state != RDS_HCA_STATE_MEM_REGISTERED) {
555 			RDS_DPRINTF3("rds_gid_to_hcap",
556 			    "HCA (0x%p, 0x%llx) is not initialized",
557 			    hcap, gid.gid_guid);
558 			hcap = hcap->hca_nextp;
559 			continue;
560 		}
561 
562 		for (ix = 0; ix < hcap->hca_nports; ix++) {
563 			if ((hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_prefix ==
564 			    gid.gid_prefix) &&
565 			    (hcap->hca_pinfop[ix].p_sgid_tbl[0].gid_guid ==
566 			    gid.gid_guid)) {
567 				RDS_DPRINTF4("rds_gid_to_hcap",
568 				    "gid found in hcap: 0x%p", hcap);
569 				rw_exit(&statep->rds_hca_lock);
570 				return (hcap);
571 			}
572 		}
573 		hcap = hcap->hca_nextp;
574 	}
575 
576 	rw_exit(&statep->rds_hca_lock);
577 
578 	return (NULL);
579 }
580 
581 /* This is called from the send CQ handler */
582 void
rds_send_acknowledgement(rds_ep_t * ep)583 rds_send_acknowledgement(rds_ep_t *ep)
584 {
585 	int	ret;
586 	uint_t	ix;
587 
588 	RDS_DPRINTF4("rds_send_acknowledgement", "Enter EP(%p)", ep);
589 
590 	mutex_enter(&ep->ep_lock);
591 
592 	ASSERT(ep->ep_rdmacnt != 0);
593 
594 	/*
595 	 * The previous ACK completed successfully, send the next one
596 	 * if more messages were received after sending the last ACK
597 	 */
598 	if (ep->ep_rbufid != *(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va) {
599 		*(uintptr_t *)(uintptr_t)ep->ep_ackds.ds_va = ep->ep_rbufid;
600 		mutex_exit(&ep->ep_lock);
601 
602 		/* send acknowledgement */
603 		RDS_INCR_TXACKS();
604 		ret = ibt_post_send(ep->ep_chanhdl, &ep->ep_ackwr, 1, &ix);
605 		if (ret != IBT_SUCCESS) {
606 			RDS_DPRINTF2("rds_send_acknowledgement",
607 			    "EP(%p): ibt_post_send for acknowledgement "
608 			    "failed: %d, SQ depth: %d",
609 			    ep, ret, ep->ep_sndpool.pool_nbusy);
610 			mutex_enter(&ep->ep_lock);
611 			ep->ep_rdmacnt--;
612 			mutex_exit(&ep->ep_lock);
613 		}
614 	} else {
615 		/* ACKed all messages, no more to ACK */
616 		ep->ep_rdmacnt--;
617 		mutex_exit(&ep->ep_lock);
618 		return;
619 	}
620 
621 	RDS_DPRINTF4("rds_send_acknowledgement", "Return EP(%p)", ep);
622 }
623 
624 static int
rds_poll_ctrl_completions(ibt_cq_hdl_t cq,rds_ep_t * ep)625 rds_poll_ctrl_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
626 {
627 	ibt_wc_t	wc;
628 	uint_t		npolled;
629 	rds_buf_t	*bp;
630 	rds_ctrl_pkt_t	*cpkt;
631 	rds_qp_t	*recvqp;
632 	int		ret = IBT_SUCCESS;
633 
634 	RDS_DPRINTF4("rds_poll_ctrl_completions", "Enter: EP(%p)", ep);
635 
636 	bzero(&wc, sizeof (ibt_wc_t));
637 	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
638 	if (ret != IBT_SUCCESS) {
639 		if (ret != IBT_CQ_EMPTY) {
640 			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
641 			    "returned: %d", ep, cq, ret);
642 		} else {
643 			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
644 			    "returned: IBT_CQ_EMPTY", ep, cq);
645 		}
646 		return (ret);
647 	}
648 
649 	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
650 
651 	if (wc.wc_status != IBT_WC_SUCCESS) {
652 		mutex_enter(&ep->ep_recvqp.qp_lock);
653 		ep->ep_recvqp.qp_level--;
654 		mutex_exit(&ep->ep_recvqp.qp_lock);
655 
656 		/* Free the buffer */
657 		bp->buf_state = RDS_RCVBUF_FREE;
658 		rds_free_recv_buf(bp, 1);
659 
660 		/* Receive completion failure */
661 		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
662 			RDS_DPRINTF2("rds_poll_ctrl_completions",
663 			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
664 			    ep, cq, wc.wc_id, wc.wc_status);
665 		}
666 		return (ret);
667 	}
668 
669 	/* there is one less in the RQ */
670 	recvqp = &ep->ep_recvqp;
671 	mutex_enter(&recvqp->qp_lock);
672 	recvqp->qp_level--;
673 	if ((recvqp->qp_taskqpending == B_FALSE) &&
674 	    (recvqp->qp_level <= recvqp->qp_lwm)) {
675 		/* Time to post more buffers into the RQ */
676 		recvqp->qp_taskqpending = B_TRUE;
677 		mutex_exit(&recvqp->qp_lock);
678 
679 		ret = ddi_taskq_dispatch(rds_taskq,
680 		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
681 		if (ret != DDI_SUCCESS) {
682 			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
683 			    ret);
684 			mutex_enter(&recvqp->qp_lock);
685 			recvqp->qp_taskqpending = B_FALSE;
686 			mutex_exit(&recvqp->qp_lock);
687 		}
688 	} else {
689 		mutex_exit(&recvqp->qp_lock);
690 	}
691 
692 	cpkt = (rds_ctrl_pkt_t *)(uintptr_t)bp->buf_ds.ds_va;
693 	rds_handle_control_message(ep->ep_sp, cpkt);
694 
695 	bp->buf_state = RDS_RCVBUF_FREE;
696 	rds_free_recv_buf(bp, 1);
697 
698 	RDS_DPRINTF4("rds_poll_ctrl_completions", "Return: EP(%p)", ep);
699 
700 	return (ret);
701 }
702 
/* Upper bound on the number of receive WRs handed to one ibt_post_recv() */
#define	RDS_POST_FEW_ATATIME	100
/*
 * Post recv WRs into the RQ. Assumes the ep->refcnt is already incremented
 *
 * arg - the channel handle (ibt_channel_hdl_t) of the endpoint; the
 *       endpoint itself is recovered with ibt_get_chan_private().
 *
 * This is the function dispatched on rds_taskq by the receive completion
 * pollers after they set qp_taskqpending.  It fills the free space of the
 * RQ (qp_depth - qp_level) with buffers taken from the data or control
 * pool, posting them in batches of at most RDS_POST_FEW_ATATIME, and then
 * adds the number actually posted to qp_level.
 *
 * NOTE(review): the two early returns below (HCA not found, session not
 * active) leave qp_taskqpending untouched, whereas every other exit either
 * clears it or re-dispatches this function -- confirm that is intended.
 */
void
rds_post_recv_buf(void *arg)
{
	ibt_channel_hdl_t	chanhdl;
	rds_ep_t		*ep;
	rds_session_t		*sp;
	rds_qp_t		*recvqp;
	rds_bufpool_t		*gp;
	rds_buf_t		*bp, *bp1;
	ibt_recv_wr_t		*wrp, wr[RDS_POST_FEW_ATATIME];
	rds_hca_t		*hcap;
	uint_t			npost, nspace, rcv_len;
	uint_t			ix, jx, kx;
	int			ret;

	chanhdl = (ibt_channel_hdl_t)arg;
	RDS_DPRINTF4("rds_post_recv_buf", "Enter: CHAN(%p)", chanhdl);
	RDS_INCR_POST_RCV_BUF_CALLS();

	ep = (rds_ep_t *)ibt_get_chan_private(chanhdl);
	ASSERT(ep != NULL);
	sp = ep->ep_sp;
	recvqp = &ep->ep_recvqp;

	RDS_DPRINTF5("rds_post_recv_buf", "EP(%p)", ep);

	/* get the hcap for the HCA hosting this channel */
	hcap = rds_lkup_hca(ep->ep_hca_guid);
	if (hcap == NULL) {
		RDS_DPRINTF2("rds_post_recv_buf", "HCA (0x%llx) not found",
		    ep->ep_hca_guid);
		return;
	}

	/* Make sure the session is still connected */
	rw_enter(&sp->session_lock, RW_READER);
	if ((sp->session_state != RDS_SESSION_STATE_INIT) &&
	    (sp->session_state != RDS_SESSION_STATE_CONNECTED) &&
	    (sp->session_state != RDS_SESSION_STATE_HCA_CLOSING)) {
		RDS_DPRINTF2("rds_post_recv_buf", "EP(%p): Session is not "
		    "in active state (%d)", ep, sp->session_state);
		rw_exit(&sp->session_lock);
		return;
	}
	rw_exit(&sp->session_lock);

	/* how many can be posted */
	mutex_enter(&recvqp->qp_lock);
	nspace = recvqp->qp_depth - recvqp->qp_level;
	if (nspace == 0) {
		RDS_DPRINTF2("rds_post_recv_buf", "RQ is FULL");
		recvqp->qp_taskqpending = B_FALSE;
		mutex_exit(&recvqp->qp_lock);
		return;
	}
	mutex_exit(&recvqp->qp_lock);

	/* data and control endpoints use different pools and buffer sizes */
	if (ep->ep_type == RDS_EP_TYPE_DATA) {
		gp = &rds_dpool;
		rcv_len = RdsPktSize;
	} else {
		gp = &rds_cpool;
		rcv_len = RDS_CTRLPKT_SIZE;
	}

	/*
	 * Ask for nspace buffers; jx is set by rds_get_buf() and, judging by
	 * the "needed/available" message below, is the number obtained.
	 */
	bp = rds_get_buf(gp, nspace, &jx);
	if (bp == NULL) {
		RDS_DPRINTF2(LABEL, "EP(%p): No Recv buffers available", ep);
		/* try again later */
		ret = ddi_taskq_dispatch(rds_taskq, rds_post_recv_buf,
		    (void *)chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
			    ret);
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
		return;
	}

	if (jx != nspace) {
		RDS_DPRINTF2(LABEL, "EP(%p): Recv buffers "
		    "needed: %d available: %d", ep, nspace, jx);
		nspace = jx;
	}

	/* prepare every buffer on the list (linked through buf_nextp) */
	bp1 = bp;
	for (ix = 0; ix < nspace; ix++) {
		bp1->buf_ep = ep;
		ASSERT(bp1->buf_state == RDS_RCVBUF_FREE);
		bp1->buf_state = RDS_RCVBUF_POSTED;
		bp1->buf_ds.ds_key = hcap->hca_lkey;
		bp1->buf_ds.ds_len = rcv_len;
		bp1 = bp1->buf_nextp;
	}

#if 0
	wrp = kmem_zalloc(RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t),
	    KM_SLEEP);
#else
	/* the WR batch lives in the on-stack array */
	wrp = &wr[0];
#endif

	/* npost counts the buffers that still have to be posted */
	npost = nspace;
	while (npost) {
		/* size of this batch */
		jx = (npost > RDS_POST_FEW_ATATIME) ?
		    RDS_POST_FEW_ATATIME : npost;
		for (ix = 0; ix < jx; ix++) {
			/* the buffer address doubles as the WR id */
			wrp[ix].wr_id = (uintptr_t)bp;
			wrp[ix].wr_nds = 1;
			wrp[ix].wr_sgl = &bp->buf_ds;
			bp = bp->buf_nextp;
		}

		/* kx is set by ibt_post_recv() and compared with jx below */
		ret = ibt_post_recv(chanhdl, wrp, jx, &kx);
		if ((ret != IBT_SUCCESS) || (kx != jx)) {
			RDS_DPRINTF2(LABEL, "ibt_post_recv for %d WRs failed: "
			    "%d", npost, ret);
			/*
			 * NOTE(review): the buffers that were taken from the
			 * pool but not posted are not handed back anywhere
			 * in this function -- confirm whether they are
			 * reclaimed elsewhere.
			 */
			npost -= kx;
			break;
		}

		npost -= jx;
	}

	/* account for what was actually posted */
	mutex_enter(&recvqp->qp_lock);
	if (npost != 0) {
		RDS_DPRINTF2("rds_post_recv_buf",
		    "EP(%p) Failed to post %d WRs", ep, npost);
		recvqp->qp_level += (nspace - npost);
	} else {
		recvqp->qp_level += nspace;
	}

	/*
	 * sometimes, the recv WRs can get consumed as soon as they are
	 * posted. In that case, taskq thread to post more WRs to the RQ will
	 * not be scheduled as the taskqpending flag is still set.
	 */
	if (recvqp->qp_level == 0) {
		mutex_exit(&recvqp->qp_lock);
		ret = ddi_taskq_dispatch(rds_taskq,
		    rds_post_recv_buf, (void *)chanhdl, DDI_NOSLEEP);
		if (ret != DDI_SUCCESS) {
			RDS_DPRINTF2("rds_post_recv_buf",
			    "ddi_taskq_dispatch failed: %d", ret);
			mutex_enter(&recvqp->qp_lock);
			recvqp->qp_taskqpending = B_FALSE;
			mutex_exit(&recvqp->qp_lock);
		}
	} else {
		recvqp->qp_taskqpending = B_FALSE;
		mutex_exit(&recvqp->qp_lock);
	}

#if 0
	kmem_free(wrp, RDS_POST_FEW_ATATIME * sizeof (ibt_recv_wr_t));
#endif

	RDS_DPRINTF4("rds_post_recv_buf", "Return: EP(%p)", ep);
}
867 
868 static int
rds_poll_data_completions(ibt_cq_hdl_t cq,rds_ep_t * ep)869 rds_poll_data_completions(ibt_cq_hdl_t cq, rds_ep_t *ep)
870 {
871 	ibt_wc_t	wc;
872 	rds_buf_t	*bp;
873 	rds_data_hdr_t	*pktp;
874 	rds_qp_t	*recvqp;
875 	uint_t		npolled;
876 	int		ret = IBT_SUCCESS;
877 
878 
879 	RDS_DPRINTF4("rds_poll_data_completions", "Enter: EP(%p)", ep);
880 
881 	bzero(&wc, sizeof (ibt_wc_t));
882 	ret = ibt_poll_cq(cq, &wc, 1, &npolled);
883 	if (ret != IBT_SUCCESS) {
884 		if (ret != IBT_CQ_EMPTY) {
885 			RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
886 			    "returned: %d", ep, cq, ret);
887 		} else {
888 			RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): ibt_poll_cq "
889 			    "returned: IBT_CQ_EMPTY", ep, cq);
890 		}
891 		return (ret);
892 	}
893 
894 	bp = (rds_buf_t *)(uintptr_t)wc.wc_id;
895 	ASSERT(bp->buf_state == RDS_RCVBUF_POSTED);
896 	bp->buf_state = RDS_RCVBUF_ONSOCKQ;
897 	bp->buf_nextp = NULL;
898 
899 	if (wc.wc_status != IBT_WC_SUCCESS) {
900 		mutex_enter(&ep->ep_recvqp.qp_lock);
901 		ep->ep_recvqp.qp_level--;
902 		mutex_exit(&ep->ep_recvqp.qp_lock);
903 
904 		/* free the buffer */
905 		bp->buf_state = RDS_RCVBUF_FREE;
906 		rds_free_recv_buf(bp, 1);
907 
908 		/* Receive completion failure */
909 		if (wc.wc_status != IBT_WC_WR_FLUSHED_ERR) {
910 			RDS_DPRINTF2("rds_poll_data_completions",
911 			    "EP(%p) CQ(%p) BP(%p): WC Error Status: %d",
912 			    ep, cq, wc.wc_id, wc.wc_status);
913 			RDS_INCR_RXERRS();
914 		}
915 		return (ret);
916 	}
917 
918 	/* there is one less in the RQ */
919 	recvqp = &ep->ep_recvqp;
920 	mutex_enter(&recvqp->qp_lock);
921 	recvqp->qp_level--;
922 	if ((recvqp->qp_taskqpending == B_FALSE) &&
923 	    (recvqp->qp_level <= recvqp->qp_lwm)) {
924 		/* Time to post more buffers into the RQ */
925 		recvqp->qp_taskqpending = B_TRUE;
926 		mutex_exit(&recvqp->qp_lock);
927 
928 		ret = ddi_taskq_dispatch(rds_taskq,
929 		    rds_post_recv_buf, (void *)ep->ep_chanhdl, DDI_NOSLEEP);
930 		if (ret != DDI_SUCCESS) {
931 			RDS_DPRINTF2(LABEL, "ddi_taskq_dispatch failed: %d",
932 			    ret);
933 			mutex_enter(&recvqp->qp_lock);
934 			recvqp->qp_taskqpending = B_FALSE;
935 			mutex_exit(&recvqp->qp_lock);
936 		}
937 	} else {
938 		mutex_exit(&recvqp->qp_lock);
939 	}
940 
941 	pktp = (rds_data_hdr_t *)(uintptr_t)bp->buf_ds.ds_va;
942 	ASSERT(pktp->dh_datalen != 0);
943 
944 	RDS_DPRINTF5(LABEL, "Message Received: sendIP: 0x%x recvIP: 0x%x "
945 	    "sendport: %d recvport: %d npkts: %d pktno: %d", ep->ep_remip,
946 	    ep->ep_myip, pktp->dh_sendport, pktp->dh_recvport,
947 	    pktp->dh_npkts, pktp->dh_psn);
948 
949 	RDS_DPRINTF3(LABEL, "BP(%p): npkts: %d psn: %d", bp,
950 	    pktp->dh_npkts, pktp->dh_psn);
951 
952 	if (pktp->dh_npkts == 1) {
953 		/* single pkt or last packet */
954 		if (pktp->dh_psn != 0) {
955 			/* last packet of a segmented message */
956 			ASSERT(ep->ep_seglbp != NULL);
957 			ep->ep_seglbp->buf_nextp = bp;
958 			ep->ep_seglbp = bp;
959 			rds_received_msg(ep, ep->ep_segfbp);
960 			ep->ep_segfbp = NULL;
961 			ep->ep_seglbp = NULL;
962 		} else {
963 			/* single packet */
964 			rds_received_msg(ep, bp);
965 		}
966 	} else {
967 		/* multi-pkt msg */
968 		if (pktp->dh_psn == 0) {
969 			/* first packet */
970 			ASSERT(ep->ep_segfbp == NULL);
971 			ep->ep_segfbp = bp;
972 			ep->ep_seglbp = bp;
973 		} else {
974 			/* intermediate packet */
975 			ASSERT(ep->ep_segfbp != NULL);
976 			ep->ep_seglbp->buf_nextp = bp;
977 			ep->ep_seglbp = bp;
978 		}
979 	}
980 
981 	RDS_DPRINTF4("rds_poll_data_completions", "Return: EP(%p)", ep);
982 
983 	return (ret);
984 }
985 
986 void
rds_recvcq_handler(ibt_cq_hdl_t cq,void * arg)987 rds_recvcq_handler(ibt_cq_hdl_t cq, void *arg)
988 {
989 	rds_ep_t	*ep;
990 	int		ret = IBT_SUCCESS;
991 	int		(*func)(ibt_cq_hdl_t, rds_ep_t *);
992 
993 	ep = (rds_ep_t *)arg;
994 
995 	RDS_DPRINTF4("rds_recvcq_handler", "enter: EP(%p)", ep);
996 
997 	if (ep->ep_type == RDS_EP_TYPE_DATA) {
998 		func = rds_poll_data_completions;
999 	} else {
1000 		func = rds_poll_ctrl_completions;
1001 	}
1002 
1003 	do {
1004 		ret = func(cq, ep);
1005 	} while (ret != IBT_CQ_EMPTY);
1006 
1007 	/* enable the CQ */
1008 	ret = ibt_enable_cq_notify(cq, rds_wc_signal);
1009 	if (ret != IBT_SUCCESS) {
1010 		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1011 		    "failed: %d", ep, cq, ret);
1012 		return;
1013 	}
1014 
1015 	do {
1016 		ret = func(cq, ep);
1017 	} while (ret != IBT_CQ_EMPTY);
1018 
1019 	RDS_DPRINTF4("rds_recvcq_handler", "Return: EP(%p)", ep);
1020 }
1021 
1022 void
rds_poll_send_completions(ibt_cq_hdl_t cq,rds_ep_t * ep,boolean_t lock)1023 rds_poll_send_completions(ibt_cq_hdl_t cq, rds_ep_t *ep, boolean_t lock)
1024 {
1025 	ibt_wc_t	wc[RDS_NUM_DATA_SEND_WCS];
1026 	uint_t		npolled, nret, send_error = 0;
1027 	rds_buf_t	*headp, *tailp, *bp;
1028 	int		ret, ix;
1029 
1030 	RDS_DPRINTF4("rds_poll_send_completions", "Enter EP(%p)", ep);
1031 
1032 	headp = NULL;
1033 	tailp = NULL;
1034 	npolled = 0;
1035 	do {
1036 		ret = ibt_poll_cq(cq, wc, RDS_NUM_DATA_SEND_WCS, &nret);
1037 		if (ret != IBT_SUCCESS) {
1038 			if (ret != IBT_CQ_EMPTY) {
1039 				RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): "
1040 				    "ibt_poll_cq returned: %d", ep, cq, ret);
1041 			} else {
1042 				RDS_DPRINTF5(LABEL, "EP(%p) CQ(%p): "
1043 				    "ibt_poll_cq returned: IBT_CQ_EMPTY",
1044 				    ep, cq);
1045 			}
1046 
1047 			break;
1048 		}
1049 
1050 		for (ix = 0; ix < nret; ix++) {
1051 			if (wc[ix].wc_status == IBT_WC_SUCCESS) {
1052 				if (wc[ix].wc_type == IBT_WRC_RDMAW) {
1053 					rds_send_acknowledgement(ep);
1054 					continue;
1055 				}
1056 
1057 				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
1058 				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
1059 				bp->buf_state = RDS_SNDBUF_FREE;
1060 			} else if (wc[ix].wc_status == IBT_WC_WR_FLUSHED_ERR) {
1061 				RDS_INCR_TXERRS();
1062 				RDS_DPRINTF5("rds_poll_send_completions",
1063 				    "EP(%p): WC ID: %p ERROR: %d", ep,
1064 				    wc[ix].wc_id, wc[ix].wc_status);
1065 
1066 				send_error = 1;
1067 
1068 				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
1069 					mutex_enter(&ep->ep_lock);
1070 					ep->ep_rdmacnt--;
1071 					mutex_exit(&ep->ep_lock);
1072 					continue;
1073 				}
1074 
1075 				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
1076 				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
1077 				bp->buf_state = RDS_SNDBUF_FREE;
1078 			} else {
1079 				RDS_INCR_TXERRS();
1080 				RDS_DPRINTF2("rds_poll_send_completions",
1081 				    "EP(%p): WC ID: %p ERROR: %d", ep,
1082 				    wc[ix].wc_id, wc[ix].wc_status);
1083 				if (send_error == 0) {
1084 					rds_session_t	*sp = ep->ep_sp;
1085 
1086 					/* don't let anyone send anymore */
1087 					rw_enter(&sp->session_lock, RW_WRITER);
1088 					if (sp->session_state !=
1089 					    RDS_SESSION_STATE_ERROR) {
1090 						sp->session_state =
1091 						    RDS_SESSION_STATE_ERROR;
1092 						/* Make this the active end */
1093 						sp->session_type =
1094 						    RDS_SESSION_ACTIVE;
1095 					}
1096 					rw_exit(&sp->session_lock);
1097 				}
1098 
1099 				send_error = 1;
1100 
1101 				if (wc[ix].wc_id == RDS_RDMAW_WRID) {
1102 					mutex_enter(&ep->ep_lock);
1103 					ep->ep_rdmacnt--;
1104 					mutex_exit(&ep->ep_lock);
1105 					continue;
1106 				}
1107 
1108 				bp = (rds_buf_t *)(uintptr_t)wc[ix].wc_id;
1109 				ASSERT(bp->buf_state == RDS_SNDBUF_PENDING);
1110 				bp->buf_state = RDS_SNDBUF_FREE;
1111 			}
1112 
1113 			bp->buf_nextp = NULL;
1114 			if (headp) {
1115 				tailp->buf_nextp = bp;
1116 				tailp = bp;
1117 			} else {
1118 				headp = bp;
1119 				tailp = bp;
1120 			}
1121 
1122 			npolled++;
1123 		}
1124 
1125 		if (rds_no_interrupts && (npolled > 100)) {
1126 			break;
1127 		}
1128 
1129 		if (rds_no_interrupts == 1) {
1130 			break;
1131 		}
1132 	} while (ret != IBT_CQ_EMPTY);
1133 
1134 	RDS_DPRINTF5("rds_poll_send_completions", "Npolled: %d send_error: %d",
1135 	    npolled, send_error);
1136 
1137 	/* put the buffers to the pool */
1138 	if (npolled != 0) {
1139 		rds_free_send_buf(ep, headp, tailp, npolled, lock);
1140 	}
1141 
1142 	if (send_error != 0) {
1143 		rds_handle_send_error(ep);
1144 	}
1145 
1146 	RDS_DPRINTF4("rds_poll_send_completions", "Return EP(%p)", ep);
1147 }
1148 
1149 void
rds_sendcq_handler(ibt_cq_hdl_t cq,void * arg)1150 rds_sendcq_handler(ibt_cq_hdl_t cq, void *arg)
1151 {
1152 	rds_ep_t	*ep;
1153 	int		ret;
1154 
1155 	ep = (rds_ep_t *)arg;
1156 
1157 	RDS_DPRINTF4("rds_sendcq_handler", "Enter: EP(%p)", ep);
1158 
1159 	/* enable the CQ */
1160 	ret = ibt_enable_cq_notify(cq, IBT_NEXT_COMPLETION);
1161 	if (ret != IBT_SUCCESS) {
1162 		RDS_DPRINTF2(LABEL, "EP(%p) CQ(%p): ibt_enable_cq_notify "
1163 		    "failed: %d", ep, cq, ret);
1164 		return;
1165 	}
1166 
1167 	rds_poll_send_completions(cq, ep, B_FALSE);
1168 
1169 	RDS_DPRINTF4("rds_sendcq_handler", "Return: EP(%p)", ep);
1170 }
1171 
1172 void
rds_ep_free_rc_channel(rds_ep_t * ep)1173 rds_ep_free_rc_channel(rds_ep_t *ep)
1174 {
1175 	int ret;
1176 
1177 	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Enter", ep);
1178 
1179 	ASSERT(mutex_owned(&ep->ep_lock));
1180 
1181 	/* free the QP */
1182 	if (ep->ep_chanhdl != NULL) {
1183 		/* wait until the RQ is empty */
1184 		(void) ibt_flush_channel(ep->ep_chanhdl);
1185 		(void) rds_is_recvq_empty(ep, B_TRUE);
1186 		ret = ibt_free_channel(ep->ep_chanhdl);
1187 		if (ret != IBT_SUCCESS) {
1188 			RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) "
1189 			    "ibt_free_channel returned: %d", ep, ret);
1190 		}
1191 		ep->ep_chanhdl = NULL;
1192 	} else {
1193 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1194 		    "EP(%p) Channel is ALREADY FREE", ep);
1195 	}
1196 
1197 	/* free the Send CQ */
1198 	if (ep->ep_sendcq != NULL) {
1199 		ret = ibt_free_cq(ep->ep_sendcq);
1200 		if (ret != IBT_SUCCESS) {
1201 			RDS_DPRINTF2("rds_ep_free_rc_channel",
1202 			    "EP(%p) - for sendcq, ibt_free_cq returned %d",
1203 			    ep, ret);
1204 		}
1205 		ep->ep_sendcq = NULL;
1206 	} else {
1207 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1208 		    "EP(%p) SendCQ is ALREADY FREE", ep);
1209 	}
1210 
1211 	/* free the Recv CQ */
1212 	if (ep->ep_recvcq != NULL) {
1213 		ret = ibt_free_cq(ep->ep_recvcq);
1214 		if (ret != IBT_SUCCESS) {
1215 			RDS_DPRINTF2("rds_ep_free_rc_channel",
1216 			    "EP(%p) - for recvcq, ibt_free_cq returned %d",
1217 			    ep, ret);
1218 		}
1219 		ep->ep_recvcq = NULL;
1220 	} else {
1221 		RDS_DPRINTF2("rds_ep_free_rc_channel",
1222 		    "EP(%p) RecvCQ is ALREADY FREE", ep);
1223 	}
1224 
1225 	RDS_DPRINTF2("rds_ep_free_rc_channel", "EP(%p) - Return", ep);
1226 }
1227 
1228 /* Allocate resources for RC channel */
1229 ibt_channel_hdl_t
rds_ep_alloc_rc_channel(rds_ep_t * ep,uint8_t hca_port)1230 rds_ep_alloc_rc_channel(rds_ep_t *ep, uint8_t hca_port)
1231 {
1232 	int				ret = IBT_SUCCESS;
1233 	ibt_cq_attr_t			scqattr, rcqattr;
1234 	ibt_rc_chan_alloc_args_t	chanargs;
1235 	ibt_channel_hdl_t		chanhdl;
1236 	rds_session_t			*sp;
1237 	rds_hca_t			*hcap;
1238 
1239 	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Enter: 0x%p port: %d",
1240 	    ep, hca_port);
1241 
1242 	/* Update the EP with the right IP address and HCA guid */
1243 	sp = ep->ep_sp;
1244 	ASSERT(sp != NULL);
1245 	rw_enter(&sp->session_lock, RW_READER);
1246 	mutex_enter(&ep->ep_lock);
1247 	ep->ep_myip = sp->session_myip;
1248 	ep->ep_remip = sp->session_remip;
1249 	hcap = rds_gid_to_hcap(rdsib_statep, sp->session_lgid);
1250 	ep->ep_hca_guid = hcap->hca_guid;
1251 	mutex_exit(&ep->ep_lock);
1252 	rw_exit(&sp->session_lock);
1253 
1254 	/* reset taskqpending flag here */
1255 	ep->ep_recvqp.qp_taskqpending = B_FALSE;
1256 
1257 	if (ep->ep_type == RDS_EP_TYPE_CTRL) {
1258 		scqattr.cq_size = MaxCtrlSendBuffers;
1259 		scqattr.cq_sched = NULL;
1260 		scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1261 
1262 		rcqattr.cq_size = MaxCtrlRecvBuffers;
1263 		rcqattr.cq_sched = NULL;
1264 		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1265 
1266 		chanargs.rc_sizes.cs_sq = MaxCtrlSendBuffers;
1267 		chanargs.rc_sizes.cs_rq = MaxCtrlRecvBuffers;
1268 		chanargs.rc_sizes.cs_sq_sgl = 1;
1269 		chanargs.rc_sizes.cs_rq_sgl = 1;
1270 	} else {
1271 		scqattr.cq_size = MaxDataSendBuffers + RDS_NUM_ACKS;
1272 		scqattr.cq_sched = NULL;
1273 		scqattr.cq_flags = IBT_CQ_NO_FLAGS;
1274 
1275 		rcqattr.cq_size = MaxDataRecvBuffers;
1276 		rcqattr.cq_sched = NULL;
1277 		rcqattr.cq_flags = IBT_CQ_NO_FLAGS;
1278 
1279 		chanargs.rc_sizes.cs_sq = MaxDataSendBuffers + RDS_NUM_ACKS;
1280 		chanargs.rc_sizes.cs_rq = MaxDataRecvBuffers;
1281 		chanargs.rc_sizes.cs_sq_sgl = 1;
1282 		chanargs.rc_sizes.cs_rq_sgl = 1;
1283 	}
1284 
1285 	mutex_enter(&ep->ep_lock);
1286 	if (ep->ep_sendcq == NULL) {
1287 		/* returned size is always greater than the requested size */
1288 		ret = ibt_alloc_cq(hcap->hca_hdl, &scqattr,
1289 		    &ep->ep_sendcq, NULL);
1290 		if (ret != IBT_SUCCESS) {
1291 			RDS_DPRINTF2(LABEL, "ibt_alloc_cq for sendCQ "
1292 			    "failed, size = %d: %d", scqattr.cq_size, ret);
1293 			mutex_exit(&ep->ep_lock);
1294 			return (NULL);
1295 		}
1296 
1297 		(void) ibt_set_cq_handler(ep->ep_sendcq, rds_sendcq_handler,
1298 		    ep);
1299 
1300 		if (rds_no_interrupts == 0) {
1301 			ret = ibt_enable_cq_notify(ep->ep_sendcq,
1302 			    IBT_NEXT_COMPLETION);
1303 			if (ret != IBT_SUCCESS) {
1304 				RDS_DPRINTF2(LABEL,
1305 				    "ibt_enable_cq_notify failed: %d", ret);
1306 				(void) ibt_free_cq(ep->ep_sendcq);
1307 				ep->ep_sendcq = NULL;
1308 				mutex_exit(&ep->ep_lock);
1309 				return (NULL);
1310 			}
1311 		}
1312 	}
1313 
1314 	if (ep->ep_recvcq == NULL) {
1315 		/* returned size is always greater than the requested size */
1316 		ret = ibt_alloc_cq(hcap->hca_hdl, &rcqattr,
1317 		    &ep->ep_recvcq, NULL);
1318 		if (ret != IBT_SUCCESS) {
1319 			RDS_DPRINTF2(LABEL, "ibt_alloc_cq for recvCQ "
1320 			    "failed, size = %d: %d", rcqattr.cq_size, ret);
1321 			(void) ibt_free_cq(ep->ep_sendcq);
1322 			ep->ep_sendcq = NULL;
1323 			mutex_exit(&ep->ep_lock);
1324 			return (NULL);
1325 		}
1326 
1327 		(void) ibt_set_cq_handler(ep->ep_recvcq, rds_recvcq_handler,
1328 		    ep);
1329 
1330 		ret = ibt_enable_cq_notify(ep->ep_recvcq, rds_wc_signal);
1331 		if (ret != IBT_SUCCESS) {
1332 			RDS_DPRINTF2(LABEL,
1333 			    "ibt_enable_cq_notify failed: %d", ret);
1334 			(void) ibt_free_cq(ep->ep_recvcq);
1335 			ep->ep_recvcq = NULL;
1336 			(void) ibt_free_cq(ep->ep_sendcq);
1337 			ep->ep_sendcq = NULL;
1338 			mutex_exit(&ep->ep_lock);
1339 			return (NULL);
1340 		}
1341 	}
1342 
1343 	chanargs.rc_flags = IBT_ALL_SIGNALED;
1344 	chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
1345 	    IBT_CEP_ATOMIC;
1346 	chanargs.rc_hca_port_num = hca_port;
1347 	chanargs.rc_scq = ep->ep_sendcq;
1348 	chanargs.rc_rcq = ep->ep_recvcq;
1349 	chanargs.rc_pd = hcap->hca_pdhdl;
1350 	chanargs.rc_srq = NULL;
1351 
1352 	ret = ibt_alloc_rc_channel(hcap->hca_hdl,
1353 	    IBT_ACHAN_NO_FLAGS, &chanargs, &chanhdl, NULL);
1354 	if (ret != IBT_SUCCESS) {
1355 		RDS_DPRINTF2(LABEL, "ibt_alloc_rc_channel fail: %d",
1356 		    ret);
1357 		(void) ibt_free_cq(ep->ep_recvcq);
1358 		ep->ep_recvcq = NULL;
1359 		(void) ibt_free_cq(ep->ep_sendcq);
1360 		ep->ep_sendcq = NULL;
1361 		mutex_exit(&ep->ep_lock);
1362 		return (NULL);
1363 	}
1364 	mutex_exit(&ep->ep_lock);
1365 
1366 	/* Chan private should contain the ep */
1367 	(void) ibt_set_chan_private(chanhdl, ep);
1368 
1369 	RDS_DPRINTF4("rds_ep_alloc_rc_channel", "Return: 0x%p", chanhdl);
1370 
1371 	return (chanhdl);
1372 }
1373 
1374 
#if 0

/*
 * Return node guid given a port gid, or 0 when the node information
 * for the gid cannot be obtained. (Currently compiled out.)
 */
ib_guid_t
rds_gid_to_node_guid(ib_gid_t gid)
{
	ibt_node_info_t	ninfo;

	RDS_DPRINTF4("rds_gid_to_node_guid", "Enter: gid: %llx:%llx",
	    gid.gid_prefix, gid.gid_guid);

	if (ibt_gid_to_node_info(gid, &ninfo) != IBT_SUCCESS) {
		RDS_DPRINTF2(LABEL, "ibt_gid_node_info for gid: %llx:%llx "
		    "failed", gid.gid_prefix, gid.gid_guid);
		return (0LL);
	}

	RDS_DPRINTF4("rds_gid_to_node_guid", "Return: Node guid: %llx",
	    ninfo.n_node_guid);

	return (ninfo.n_node_guid);
}

#endif
1401 
1402 static void
rds_handle_portup_event(rds_state_t * statep,ibt_hca_hdl_t hdl,ibt_async_event_t * event)1403 rds_handle_portup_event(rds_state_t *statep, ibt_hca_hdl_t hdl,
1404     ibt_async_event_t *event)
1405 {
1406 	rds_hca_t		*hcap;
1407 	ibt_hca_portinfo_t	*newpinfop, *oldpinfop;
1408 	uint_t			newsize, oldsize, nport;
1409 	ib_gid_t		gid;
1410 	int			ret;
1411 
1412 	RDS_DPRINTF2("rds_handle_portup_event",
1413 	    "Enter: GUID: 0x%llx Statep: %p", event->ev_hca_guid, statep);
1414 
1415 	rw_enter(&statep->rds_hca_lock, RW_WRITER);
1416 
1417 	hcap = statep->rds_hcalistp;
1418 	while ((hcap != NULL) && (hcap->hca_guid != event->ev_hca_guid)) {
1419 		hcap = hcap->hca_nextp;
1420 	}
1421 
1422 	if (hcap == NULL) {
1423 		RDS_DPRINTF2("rds_handle_portup_event", "HCA: 0x%llx is "
1424 		    "not in our list", event->ev_hca_guid);
1425 		rw_exit(&statep->rds_hca_lock);
1426 		return;
1427 	}
1428 
1429 	ret = ibt_query_hca_ports(hdl, 0, &newpinfop, &nport, &newsize);
1430 	if (ret != IBT_SUCCESS) {
1431 		RDS_DPRINTF2(LABEL, "ibt_query_hca_ports failed: %d", ret);
1432 		rw_exit(&statep->rds_hca_lock);
1433 		return;
1434 	}
1435 
1436 	oldpinfop = hcap->hca_pinfop;
1437 	oldsize = hcap->hca_pinfo_sz;
1438 	hcap->hca_pinfop = newpinfop;
1439 	hcap->hca_pinfo_sz = newsize;
1440 
1441 	(void) ibt_free_portinfo(oldpinfop, oldsize);
1442 
1443 	/* If RDS service is not registered then no bind is needed */
1444 	if (statep->rds_srvhdl == NULL) {
1445 		RDS_DPRINTF2("rds_handle_portup_event",
1446 		    "RDS Service is not registered, so no action needed");
1447 		rw_exit(&statep->rds_hca_lock);
1448 		return;
1449 	}
1450 
1451 	/*
1452 	 * If the service was previously bound on this port and
1453 	 * if this port has changed state down and now up, we do not
1454 	 * need to bind the service again. The bind is expected to
1455 	 * persist across state changes. If the service was never bound
1456 	 * before then we bind it this time.
1457 	 */
1458 	if (hcap->hca_bindhdl[event->ev_port - 1] == NULL) {
1459 
1460 		/* structure copy */
1461 		gid = newpinfop[event->ev_port - 1].p_sgid_tbl[0];
1462 
1463 		/* bind RDS service on the port, pass statep as cm_private */
1464 		ret = ibt_bind_service(statep->rds_srvhdl, gid, NULL, statep,
1465 		    &hcap->hca_bindhdl[event->ev_port - 1]);
1466 		if (ret != IBT_SUCCESS) {
1467 			RDS_DPRINTF2("rds_handle_portup_event",
1468 			    "Bind service for HCA: 0x%llx Port: %d "
1469 			    "gid %llx:%llx returned: %d", event->ev_hca_guid,
1470 			    event->ev_port, gid.gid_prefix, gid.gid_guid, ret);
1471 		}
1472 	}
1473 
1474 	rw_exit(&statep->rds_hca_lock);
1475 
1476 	RDS_DPRINTF2("rds_handle_portup_event", "Return: GUID: 0x%llx",
1477 	    event->ev_hca_guid);
1478 }
1479 
1480 static void
rdsib_add_hca(ib_guid_t hca_guid)1481 rdsib_add_hca(ib_guid_t hca_guid)
1482 {
1483 	rds_hca_t	*hcap;
1484 	ibt_mr_attr_t	mem_attr;
1485 	ibt_mr_desc_t	mem_desc;
1486 	int		ret;
1487 
1488 	RDS_DPRINTF2("rdsib_add_hca", "Enter: GUID: 0x%llx", hca_guid);
1489 
1490 	hcap = rdsib_init_hca(hca_guid);
1491 	if (hcap == NULL)
1492 		return;
1493 
1494 	/* register the recv memory with this hca */
1495 	mutex_enter(&rds_dpool.pool_lock);
1496 	if (rds_dpool.pool_memp == NULL) {
1497 		/* no memory to register */
1498 		RDS_DPRINTF2("rdsib_add_hca", "No memory to register");
1499 		mutex_exit(&rds_dpool.pool_lock);
1500 		return;
1501 	}
1502 
1503 	mem_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)rds_dpool.pool_memp;
1504 	mem_attr.mr_len = rds_dpool.pool_memsize;
1505 	mem_attr.mr_as = NULL;
1506 	mem_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;
1507 
1508 	ret = ibt_register_mr(hcap->hca_hdl, hcap->hca_pdhdl, &mem_attr,
1509 	    &hcap->hca_mrhdl, &mem_desc);
1510 
1511 	mutex_exit(&rds_dpool.pool_lock);
1512 
1513 	if (ret != IBT_SUCCESS) {
1514 		RDS_DPRINTF2("rdsib_add_hca", "ibt_register_mr failed: %d",
1515 		    ret);
1516 	} else {
1517 		rw_enter(&rdsib_statep->rds_hca_lock, RW_WRITER);
1518 		hcap->hca_state = RDS_HCA_STATE_MEM_REGISTERED;
1519 		hcap->hca_lkey = mem_desc.md_lkey;
1520 		hcap->hca_rkey = mem_desc.md_rkey;
1521 		rw_exit(&rdsib_statep->rds_hca_lock);
1522 	}
1523 
1524 	RDS_DPRINTF2("rdsib_add_hca", "Retrun: GUID: 0x%llx", hca_guid);
1525 }
1526 
1527 void rds_close_this_session(rds_session_t *sp, uint8_t wait);
1528 int rds_post_control_message(rds_session_t *sp, uint8_t code, in_port_t port);
1529 
1530 static void
rdsib_del_hca(rds_state_t * statep,ib_guid_t hca_guid)1531 rdsib_del_hca(rds_state_t *statep, ib_guid_t hca_guid)
1532 {
1533 	rds_session_t	*sp;
1534 	rds_hca_t	*hcap;
1535 	rds_hca_state_t	saved_state;
1536 	int		ret, ix;
1537 
1538 	RDS_DPRINTF2("rdsib_del_hca", "Enter: GUID: 0x%llx", hca_guid);
1539 
1540 	/*
1541 	 * This should be a write lock as we don't want anyone to get access
1542 	 * to the hcap while we are modifing its contents
1543 	 */
1544 	rw_enter(&statep->rds_hca_lock, RW_WRITER);
1545 
1546 	hcap = statep->rds_hcalistp;
1547 	while ((hcap != NULL) && (hcap->hca_guid != hca_guid)) {
1548 		hcap = hcap->hca_nextp;
1549 	}
1550 
1551 	/* Prevent initiating any new activity on this HCA */
1552 	ASSERT(hcap != NULL);
1553 	saved_state = hcap->hca_state;
1554 	hcap->hca_state = RDS_HCA_STATE_STOPPING;
1555 
1556 	rw_exit(&statep->rds_hca_lock);
1557 
1558 	/*
1559 	 * stop the outgoing traffic and close any active sessions on this hca.
1560 	 * Any pending messages in the SQ will be allowed to complete.
1561 	 */
1562 	rw_enter(&statep->rds_sessionlock, RW_READER);
1563 	sp = statep->rds_sessionlistp;
1564 	while (sp) {
1565 		if (sp->session_hca_guid != hca_guid) {
1566 			sp = sp->session_nextp;
1567 			continue;
1568 		}
1569 
1570 		rw_enter(&sp->session_lock, RW_WRITER);
1571 		RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1572 		    sp->session_state);
1573 		/*
1574 		 * We are changing the session state in advance. This prevents
1575 		 * further messages to be posted to the SQ. We then
1576 		 * send a control message to the remote and tell it close
1577 		 * the session.
1578 		 */
1579 		sp->session_state = RDS_SESSION_STATE_HCA_CLOSING;
1580 		RDS_DPRINTF3("rds_handle_cm_conn_closed", "SP(%p) State "
1581 		    "RDS_SESSION_STATE_PASSIVE_CLOSING", sp);
1582 		rw_exit(&sp->session_lock);
1583 
1584 		/*
1585 		 * wait until the sendq is empty then tell the remote to
1586 		 * close this session. This enables for graceful shutdown of
1587 		 * the session
1588 		 */
1589 		(void) rds_is_sendq_empty(&sp->session_dataep, 2);
1590 		(void) rds_post_control_message(sp,
1591 		    RDS_CTRL_CODE_CLOSE_SESSION, 0);
1592 
1593 		sp = sp->session_nextp;
1594 	}
1595 
1596 	/* wait until all the sessions are off this HCA */
1597 	sp = statep->rds_sessionlistp;
1598 	while (sp) {
1599 		if (sp->session_hca_guid != hca_guid) {
1600 			sp = sp->session_nextp;
1601 			continue;
1602 		}
1603 
1604 		rw_enter(&sp->session_lock, RW_READER);
1605 		RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1606 		    sp->session_state);
1607 
1608 		while ((sp->session_state == RDS_SESSION_STATE_HCA_CLOSING) ||
1609 		    (sp->session_state == RDS_SESSION_STATE_ERROR) ||
1610 		    (sp->session_state == RDS_SESSION_STATE_PASSIVE_CLOSING) ||
1611 		    (sp->session_state == RDS_SESSION_STATE_CLOSED)) {
1612 			rw_exit(&sp->session_lock);
1613 			delay(drv_usectohz(1000000));
1614 			rw_enter(&sp->session_lock, RW_READER);
1615 			RDS_DPRINTF2("rdsib_del_hca", "SP(%p) State: %d", sp,
1616 			    sp->session_state);
1617 		}
1618 
1619 		rw_exit(&sp->session_lock);
1620 
1621 		sp = sp->session_nextp;
1622 	}
1623 	rw_exit(&statep->rds_sessionlock);
1624 
1625 	/*
1626 	 * if rdsib_close_ib was called before this, then that would have
1627 	 * unbound the service on all ports. In that case, the HCA structs
1628 	 * will contain stale bindhdls. Hence, we do not call unbind unless
1629 	 * the service is still registered.
1630 	 */
1631 	if (statep->rds_srvhdl != NULL) {
1632 		/* unbind RDS service on all ports on this HCA */
1633 		for (ix = 0; ix < hcap->hca_nports; ix++) {
1634 			if (hcap->hca_bindhdl[ix] == NULL) {
1635 				continue;
1636 			}
1637 
1638 			RDS_DPRINTF2("rdsib_del_hca",
1639 			    "Unbinding Service: port: %d, bindhdl: %p",
1640 			    ix + 1, hcap->hca_bindhdl[ix]);
1641 			(void) ibt_unbind_service(rdsib_statep->rds_srvhdl,
1642 			    hcap->hca_bindhdl[ix]);
1643 			hcap->hca_bindhdl[ix] = NULL;
1644 		}
1645 	}
1646 
1647 	RDS_DPRINTF2("rdsib_del_hca", "HCA(%p) State: %d", hcap,
1648 	    hcap->hca_state);
1649 
1650 	switch (saved_state) {
1651 	case RDS_HCA_STATE_MEM_REGISTERED:
1652 		ASSERT(hcap->hca_mrhdl != NULL);
1653 		ret = ibt_deregister_mr(hcap->hca_hdl, hcap->hca_mrhdl);
1654 		if (ret != IBT_SUCCESS) {
1655 			RDS_DPRINTF2("rdsib_del_hca",
1656 			    "ibt_deregister_mr failed: %d", ret);
1657 			return;
1658 		}
1659 		hcap->hca_mrhdl = NULL;
1660 		/* FALLTHRU */
1661 	case RDS_HCA_STATE_OPEN:
1662 		ASSERT(hcap->hca_hdl != NULL);
1663 		ASSERT(hcap->hca_pdhdl != NULL);
1664 
1665 
1666 		ret = ibt_free_pd(hcap->hca_hdl, hcap->hca_pdhdl);
1667 		if (ret != IBT_SUCCESS) {
1668 			RDS_DPRINTF2("rdsib_del_hca",
1669 			    "ibt_free_pd failed: %d", ret);
1670 		}
1671 
1672 		(void) ibt_free_portinfo(hcap->hca_pinfop, hcap->hca_pinfo_sz);
1673 
1674 		ret = ibt_close_hca(hcap->hca_hdl);
1675 		if (ret != IBT_SUCCESS) {
1676 			RDS_DPRINTF2("rdsib_del_hca",
1677 			    "ibt_close_hca failed: %d", ret);
1678 		}
1679 
1680 		hcap->hca_hdl = NULL;
1681 		hcap->hca_pdhdl = NULL;
1682 		hcap->hca_lkey = 0;
1683 		hcap->hca_rkey = 0;
1684 	}
1685 
1686 	/*
1687 	 * This should be a write lock as we don't want anyone to get access
1688 	 * to the hcap while we are modifing its contents
1689 	 */
1690 	rw_enter(&statep->rds_hca_lock, RW_WRITER);
1691 	hcap->hca_state = RDS_HCA_STATE_REMOVED;
1692 	rw_exit(&statep->rds_hca_lock);
1693 
1694 	RDS_DPRINTF2("rdsib_del_hca", "Return: GUID: 0x%llx", hca_guid);
1695 }
1696 
1697 static void
rds_async_handler(void * clntp,ibt_hca_hdl_t hdl,ibt_async_code_t code,ibt_async_event_t * event)1698 rds_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
1699     ibt_async_event_t *event)
1700 {
1701 	rds_state_t		*statep = (rds_state_t *)clntp;
1702 
1703 	RDS_DPRINTF2("rds_async_handler", "Async code: %d", code);
1704 
1705 	switch (code) {
1706 	case IBT_EVENT_PORT_UP:
1707 		rds_handle_portup_event(statep, hdl, event);
1708 		break;
1709 	case IBT_HCA_ATTACH_EVENT:
1710 		/*
1711 		 * NOTE: In some error recovery paths, it is possible to
1712 		 * receive IBT_HCA_ATTACH_EVENTs on already known HCAs.
1713 		 */
1714 		(void) rdsib_add_hca(event->ev_hca_guid);
1715 		break;
1716 	case IBT_HCA_DETACH_EVENT:
1717 		(void) rdsib_del_hca(statep, event->ev_hca_guid);
1718 		break;
1719 
1720 	default:
1721 		RDS_DPRINTF2(LABEL, "Async event: %d not handled", code);
1722 	}
1723 
1724 	RDS_DPRINTF2("rds_async_handler", "Return: code: %d", code);
1725 }
1726 
1727 /*
1728  * This routine exists to minimize stale connections across ungraceful
1729  * reboots of nodes in a cluster.
1730  */
1731 void
rds_randomize_qps(rds_hca_t * hcap)1732 rds_randomize_qps(rds_hca_t *hcap)
1733 {
1734 	ibt_cq_attr_t			cqattr;
1735 	ibt_rc_chan_alloc_args_t	chanargs;
1736 	ibt_channel_hdl_t		qp1, qp2;
1737 	ibt_cq_hdl_t			cq_hdl;
1738 	hrtime_t			nsec;
1739 	uint8_t				i, j, rand1, rand2;
1740 	int				ret;
1741 
1742 	bzero(&cqattr, sizeof (ibt_cq_attr_t));
1743 	cqattr.cq_size = 1;
1744 	cqattr.cq_sched = NULL;
1745 	cqattr.cq_flags = IBT_CQ_NO_FLAGS;
1746 	ret = ibt_alloc_cq(hcap->hca_hdl, &cqattr, &cq_hdl, NULL);
1747 	if (ret != IBT_SUCCESS) {
1748 		RDS_DPRINTF2("rds_randomize_qps",
1749 		    "ibt_alloc_cq failed: %d", ret);
1750 		return;
1751 	}
1752 
1753 	bzero(&chanargs, sizeof (ibt_rc_chan_alloc_args_t));
1754 	chanargs.rc_flags = IBT_ALL_SIGNALED;
1755 	chanargs.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR |
1756 	    IBT_CEP_ATOMIC;
1757 	chanargs.rc_hca_port_num = 1;
1758 	chanargs.rc_scq = cq_hdl;
1759 	chanargs.rc_rcq = cq_hdl;
1760 	chanargs.rc_pd = hcap->hca_pdhdl;
1761 	chanargs.rc_srq = NULL;
1762 
1763 	nsec = gethrtime();
1764 	rand1 = (nsec & 0xF);
1765 	rand2 = (nsec >> 4) & 0xF;
1766 	RDS_DPRINTF2("rds_randomize_qps", "rand1: %d rand2: %d",
1767 	    rand1, rand2);
1768 
1769 	for (i = 0; i < rand1 + 3; i++) {
1770 		if (ibt_alloc_rc_channel(hcap->hca_hdl,
1771 		    IBT_ACHAN_NO_FLAGS, &chanargs, &qp1, NULL) !=
1772 		    IBT_SUCCESS) {
1773 			RDS_DPRINTF2("rds_randomize_qps",
1774 			    "Bailing at i: %d", i);
1775 			(void) ibt_free_cq(cq_hdl);
1776 			return;
1777 		}
1778 		for (j = 0; j < rand2 + 3; j++) {
1779 			if (ibt_alloc_rc_channel(hcap->hca_hdl,
1780 			    IBT_ACHAN_NO_FLAGS, &chanargs, &qp2,
1781 			    NULL) != IBT_SUCCESS) {
1782 				RDS_DPRINTF2("rds_randomize_qps",
1783 				    "Bailing at i: %d j: %d", i, j);
1784 				(void) ibt_free_channel(qp1);
1785 				(void) ibt_free_cq(cq_hdl);
1786 				return;
1787 			}
1788 			(void) ibt_free_channel(qp2);
1789 		}
1790 		(void) ibt_free_channel(qp1);
1791 	}
1792 
1793 	(void) ibt_free_cq(cq_hdl);
1794 }
1795