xref: /illumos-gate/usr/src/uts/common/io/ib/clients/iser/iser_ib.c (revision 9525b14bcdeb5b5f6f95ab27c2f48f18bd2ec829)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/ddi.h>
28 #include <sys/types.h>
29 #include <sys/socket.h>
30 #include <netinet/in.h>
31 #include <sys/sunddi.h>
32 #include <sys/sysmacros.h>
33 #include <sys/iscsi_protocol.h>
34 
35 #include <sys/ib/clients/iser/iser.h>
36 #include <sys/ib/clients/iser/iser_idm.h>
37 
38 /*
39  * iser_ib.c
40  * Routines for InfiniBand transport for iSER
41  *
42  * This file contains the routines to interface with the IBT API to attach and
43  * allocate IB resources, handle async events, and post recv work requests.
44  *
45  */
46 
/* HCA lookup helpers: resolve an iser_hca_t by port GID or by HCA GUID */
static iser_hca_t *iser_ib_gid2hca(ib_gid_t gid);
static iser_hca_t *iser_ib_guid2hca(ib_guid_t guid);

/* HCA list setup and teardown */
static iser_hca_t *iser_ib_alloc_hca(ib_guid_t guid);
static int iser_ib_free_hca(iser_hca_t *hca);
static int iser_ib_update_hcaports(iser_hca_t *hca);
static int iser_ib_init_hcas(void);
static int iser_ib_fini_hcas(void);

/* Per-port service binding management */
static iser_sbind_t *iser_ib_get_bind(
    iser_svc_t *iser_svc, ib_guid_t hca_guid, ib_gid_t gid);
static int iser_ib_activate_port(
    idm_svc_t *idm_svc, ib_guid_t guid, ib_gid_t gid);
static void iser_ib_deactivate_port(ib_guid_t hca_guid, ib_gid_t gid);

/* Queue pair bookkeeping within an iSER channel */
static void iser_ib_init_qp(iser_chan_t *chan, uint_t sq_size, uint_t rq_size);
static void iser_ib_fini_qp(iser_qp_t *qp);

/* Completion queue allocation for RC channels */
static int iser_ib_setup_cq(ibt_hca_hdl_t hca_hdl, uint_t cq_size,
    ibt_cq_hdl_t *cq_hdl);

/* Populates the ibt_rc_chan_alloc_args_t used by ibt_alloc_rc_channel() */
static void iser_ib_setup_chanargs(uint8_t hca_port, ibt_cq_hdl_t scq_hdl,
    ibt_cq_hdl_t rcq_hdl, uint_t sq_size, uint_t rq_size,
    ibt_pd_hdl_t hca_pdhdl, ibt_rc_chan_alloc_args_t *cargs);

/* Unaffiliated async event handlers, dispatched by iser_ib_async_handler */
static void iser_ib_handle_portup_event(ibt_hca_hdl_t hdl,
    ibt_async_event_t *event);
static void iser_ib_handle_portdown_event(ibt_hca_hdl_t hdl,
    ibt_async_event_t *event);
static void iser_ib_handle_hca_detach_event(ibt_hca_hdl_t hdl,
    ibt_async_event_t *event);

/* Taskq worker that refills a channel's receive queue */
static void iser_ib_post_recv_task(void *arg);
80 
/*
 * IBTF client registration information for iSER; passed to ibt_attach()
 * in iser_ib_init(). iser_ib_async_handler receives the unaffiliated
 * async events (port up/down, HCA detach) handled later in this file.
 */
static struct ibt_clnt_modinfo_s iser_ib_modinfo = {
	IBTI_V_CURR,		/* requested IBTI interface version */
	IBT_STORAGE_DEV,	/* client class: storage device driver */
	iser_ib_async_handler,	/* async event callback */
	NULL,			/* unused field -- TODO confirm field role */
	"iSER"			/* client name */
};
88 
89 /*
90  * iser_ib_init
91  *
92  * This function registers the HCA drivers with IBTF and registers and binds
93  * iSER as a service with IBTF.
94  */
95 int
96 iser_ib_init(void)
97 {
98 	int		status;
99 
100 	/* Register with IBTF */
101 	status = ibt_attach(&iser_ib_modinfo, iser_state->is_dip, iser_state,
102 	    &iser_state->is_ibhdl);
103 	if (status != DDI_SUCCESS) {
104 		ISER_LOG(CE_NOTE, "iser_ib_init: ibt_attach failed (0x%x)",
105 		    status);
106 		return (DDI_FAILURE);
107 	}
108 
109 	/* Create the global work request kmem_cache */
110 	iser_state->iser_wr_cache = kmem_cache_create("iser_wr_cache",
111 	    sizeof (iser_wr_t), 0, NULL, NULL, NULL,
112 	    iser_state, NULL, KM_SLEEP);
113 
114 	/* Populate our list of HCAs */
115 	status = iser_ib_init_hcas();
116 	if (status != DDI_SUCCESS) {
117 		/* HCAs failed to initialize, tear it down */
118 		kmem_cache_destroy(iser_state->iser_wr_cache);
119 		(void) ibt_detach(iser_state->is_ibhdl);
120 		iser_state->is_ibhdl = NULL;
121 		ISER_LOG(CE_NOTE, "iser_ib_init: failed to initialize HCAs");
122 		return (DDI_FAILURE);
123 	}
124 
125 	/* Target will register iSER as a service with IBTF when required */
126 
127 	/* Target will bind this service when it comes online */
128 
129 	return (DDI_SUCCESS);
130 }
131 
132 /*
133  * iser_ib_fini
134  *
135  * This function unbinds and deregisters the iSER service from IBTF
136  */
137 int
138 iser_ib_fini(void)
139 {
140 	/* IDM would have already disabled all the services */
141 
142 	/* Teardown the HCA list and associated resources */
143 	if (iser_ib_fini_hcas() != DDI_SUCCESS)
144 		return (DDI_FAILURE);
145 
146 	/* Teardown the global work request kmem_cache */
147 	kmem_cache_destroy(iser_state->iser_wr_cache);
148 
149 	/* Deregister with IBTF */
150 	if (iser_state->is_ibhdl != NULL) {
151 		(void) ibt_detach(iser_state->is_ibhdl);
152 		iser_state->is_ibhdl = NULL;
153 	}
154 
155 	return (DDI_SUCCESS);
156 }
157 
158 /*
159  * iser_ib_register_service
160  *
161  * This function registers the iSER service using the RDMA-Aware Service ID.
162  */
163 int
164 iser_ib_register_service(idm_svc_t *idm_svc)
165 {
166 	ibt_srv_desc_t	srvdesc;
167 	iser_svc_t	*iser_svc;
168 	int		status;
169 
170 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
171 
172 	/* Set up IBTI client callback handler from the CM */
173 	srvdesc.sd_handler = iser_ib_cm_handler;
174 
175 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
176 
177 	iser_svc = (iser_svc_t *)idm_svc->is_iser_svc;
178 
179 	/* Register the service on the specified port */
180 	status = ibt_register_service(
181 	    iser_state->is_ibhdl, &srvdesc,
182 	    iser_svc->is_svcid, 1, &iser_svc->is_srvhdl, NULL);
183 
184 	return (status);
185 }
186 
187 /*
188  * iser_ib_bind_service
189  *
190  * This function binds a given iSER service on all available HCA ports. The
191  * current specification does not allow user to specify transport bindings
192  * for each iscsi target. The ULP invokes this function to bind the target
193  * to all available iser ports after checking for the presence of an IB HCA.
194  * iSER is "configured" whenever an IB-capable IP address exists. The lack
195  * of active IB ports is a less-fatal condition, and sockets would be used
196  * as the transport even though an Infiniband HCA is configured but unusable.
197  *
198  */
int
iser_ib_bind_service(idm_svc_t *idm_svc)
{
	iser_hca_t	*hca;
	ib_gid_t	gid;
	int		num_ports = 0;		/* total ports examined */
	int		num_binds = 0;		/* ports bound (or already bound) */
	int		num_inactive_binds = 0; /* if HCA ports inactive */
	int		status;
	int		i;

	ASSERT(idm_svc != NULL);
	ASSERT(idm_svc->is_iser_svc != NULL);

	/*
	 * Register the iSER service on all available ports. The HCA list
	 * lock is held across the entire scan so the list cannot change
	 * underneath us.
	 */
	mutex_enter(&iser_state->is_hcalist_lock);

	for (hca = list_head(&iser_state->is_hcalist);
	    hca != NULL;
	    hca = list_next(&iser_state->is_hcalist, hca)) {

		for (i = 0; i < hca->hca_num_ports; i++) {
			num_ports++;
			if (hca->hca_port_info[i].p_linkstate !=
			    IBT_PORT_ACTIVE) {
				/*
				 * Move on. We will attempt to bind service
				 * in our async handler if the port comes up
				 * at a later time.
				 */
				num_inactive_binds++;
				continue;
			}

			/* Bind using the first GID of this port */
			gid = hca->hca_port_info[i].p_sgid_tbl[0];

			/*
			 * If the port is already bound, skip the activate
			 * call; it still counts toward num_binds below.
			 */
			if (iser_ib_get_bind(
			    idm_svc->is_iser_svc, hca->hca_guid, gid) == NULL) {

				status = iser_ib_activate_port(
				    idm_svc, hca->hca_guid, gid);
				if (status != IBT_SUCCESS) {
					ISER_LOG(CE_NOTE,
					    "iser_ib_bind_service: "
					    "iser_ib_activate_port failure "
					    "(0x%x)", status);
					continue;
				}
			}
			num_binds++;
		}
	}
	mutex_exit(&iser_state->is_hcalist_lock);

	if (num_binds) {
		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Service available on "
		    "(%d) of (%d) ports", num_binds, num_ports);
		return (ISER_STATUS_SUCCESS);
	} else if (num_inactive_binds) {
		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Could not bind "
		    "service, HCA ports are not active.");
		/*
		 * still considered success, the async handler will bind
		 * the service when the port comes up at a later time
		 */
		return (ISER_STATUS_SUCCESS);
	} else {
		/* No ports at all, or every activate attempt failed */
		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Did not bind service");
		return (ISER_STATUS_FAIL);
	}
}
271 
272 /*
273  * iser_ib_unbind_service
274  *
275  * This function unbinds a given service on a all HCA ports
276  */
277 void
278 iser_ib_unbind_service(idm_svc_t *idm_svc)
279 {
280 	iser_svc_t	*iser_svc;
281 	iser_sbind_t	*is_sbind, *next_sb;
282 
283 	if (idm_svc != NULL && idm_svc->is_iser_svc != NULL) {
284 
285 		iser_svc = idm_svc->is_iser_svc;
286 
287 		for (is_sbind = list_head(&iser_svc->is_sbindlist);
288 		    is_sbind != NULL;
289 		    is_sbind = next_sb) {
290 			next_sb = list_next(&iser_svc->is_sbindlist, is_sbind);
291 			ibt_unbind_service(iser_svc->is_srvhdl,
292 			    is_sbind->is_sbindhdl);
293 			list_remove(&iser_svc->is_sbindlist, is_sbind);
294 			kmem_free(is_sbind, sizeof (iser_sbind_t));
295 		}
296 	}
297 }
298 
299 /* ARGSUSED */
300 void
301 iser_ib_deregister_service(idm_svc_t *idm_svc)
302 {
303 	iser_svc_t	*iser_svc;
304 
305 	if (idm_svc != NULL && idm_svc->is_iser_svc != NULL) {
306 
307 		iser_svc = (iser_svc_t *)idm_svc->is_iser_svc;
308 		ibt_deregister_service(iser_state->is_ibhdl,
309 		    iser_svc->is_srvhdl);
310 		ibt_release_ip_sid(iser_svc->is_svcid);
311 	}
312 }
313 
314 /*
315  * iser_ib_get_paths
316  * This function finds the IB path between the local and the remote address.
317  *
318  */
319 int
320 iser_ib_get_paths(ibt_ip_addr_t *local_ip, ibt_ip_addr_t *remote_ip,
321     ibt_path_info_t *path, ibt_path_ip_src_t *path_src_ip)
322 {
323 	ibt_ip_path_attr_t	ipattr;
324 	int			status;
325 
326 	(void) bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
327 	ipattr.ipa_dst_ip	= remote_ip;
328 	ipattr.ipa_src_ip	= *local_ip;
329 	ipattr.ipa_max_paths	= 1;
330 	ipattr.ipa_ndst		= 1;
331 
332 	(void) bzero(path, sizeof (ibt_path_info_t));
333 	status = ibt_get_ip_paths(iser_state->is_ibhdl, IBT_PATH_NO_FLAGS,
334 	    &ipattr, path, NULL, path_src_ip);
335 	if (status != IBT_SUCCESS) {
336 		ISER_LOG(CE_NOTE, "ibt_get_ip_paths: ibt_get_ip_paths "
337 		    "failure: status (%d)", status);
338 		return (status);
339 	}
340 
341 	if (local_ip != NULL) {
342 		ISER_LOG(CE_NOTE, "iser_ib_get_paths success: IP[%x to %x]",
343 		    local_ip->un.ip4addr, remote_ip->un.ip4addr);
344 	} else {
345 		ISER_LOG(CE_NOTE, "iser_ib_get_paths success: "
346 		    "IP[INADDR_ANY to %x]", remote_ip->un.ip4addr);
347 	}
348 
349 	return (ISER_STATUS_SUCCESS);
350 }
351 
352 /*
353  * iser_ib_alloc_channel_nopathlookup
354  *
355  * This function allocates a reliable connected channel. This function does
356  * not invoke ibt_get_ip_paths() to do the path lookup. The HCA GUID and
357  * port are input to this function.
358  */
359 iser_chan_t *
360 iser_ib_alloc_channel_nopathlookup(ib_guid_t hca_guid, uint8_t hca_port)
361 {
362 	iser_hca_t	*hca;
363 	iser_chan_t	*chan;
364 
365 	/* Lookup the hca using the gid in the path info */
366 	hca = iser_ib_guid2hca(hca_guid);
367 	if (hca == NULL) {
368 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_nopathlookup: failed "
369 		    "to lookup HCA(%llx) handle", (longlong_t)hca_guid);
370 		return (NULL);
371 	}
372 
373 	chan = iser_ib_alloc_rc_channel(hca, hca_port);
374 	if (chan == NULL) {
375 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_nopathlookup: failed "
376 		    "to alloc channel on HCA(%llx) %d",
377 		    (longlong_t)hca_guid, hca_port);
378 		return (NULL);
379 	}
380 
381 	ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup success: "
382 	    "chanhdl (0x%p), HCA(%llx) %d",
383 	    (void *)chan->ic_chanhdl, (longlong_t)hca_guid, hca_port);
384 
385 	return (chan);
386 }
387 
388 /*
389  * iser_ib_alloc_channel_pathlookup
390  *
391  * This function allocates a reliable connected channel but first invokes
392  * ibt_get_ip_paths() with the given local and remote address to get the
393  * HCA lgid and the port number.
394  */
395 iser_chan_t *
396 iser_ib_alloc_channel_pathlookup(
397     ibt_ip_addr_t *local_ip, ibt_ip_addr_t *remote_ip)
398 {
399 	ibt_path_info_t		ibt_path;
400 	ibt_path_ip_src_t	path_src_ip;
401 	ib_gid_t		lgid;
402 	uint8_t			hca_port; /* from path */
403 	iser_hca_t		*hca;
404 	iser_chan_t		*chan;
405 	int			status;
406 
407 	/* Lookup a path to the given destination */
408 	status = iser_ib_get_paths(
409 	    local_ip, remote_ip, &ibt_path, &path_src_ip);
410 
411 	if (status != ISER_STATUS_SUCCESS) {
412 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup: faild "
413 		    "Path lookup IP:[%llx to %llx] failed: status (%d)",
414 		    (longlong_t)local_ip->un.ip4addr,
415 		    (longlong_t)remote_ip->un.ip4addr,
416 		    status);
417 		return (NULL);
418 	}
419 
420 	/* get the local gid from the path info */
421 	lgid = ibt_path.pi_prim_cep_path.cep_adds_vect.av_sgid;
422 
423 	/* get the hca port from the path info */
424 	hca_port = ibt_path.pi_prim_cep_path.cep_hca_port_num;
425 
426 	/* Lookup the hca using the gid in the path info */
427 	hca = iser_ib_gid2hca(lgid);
428 	if (hca == NULL) {
429 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup: failed "
430 		    "to lookup HCA (%llx) handle",
431 		    (longlong_t)hca->hca_guid);
432 		return (NULL);
433 	}
434 
435 	chan = iser_ib_alloc_rc_channel(hca, hca_port);
436 	if (chan == NULL) {
437 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup: failed "
438 		    "to alloc channel from IP:[%llx to %llx] on HCA (%llx) %d",
439 		    (longlong_t)local_ip->un.ip4addr,
440 		    (longlong_t)remote_ip->un.ip4addr,
441 		    (longlong_t)hca->hca_guid, hca_port);
442 		return (NULL);
443 	}
444 
445 	ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup success: "
446 	    "chanhdl (0x%p), IP:[%llx to %llx], lgid (%llx:%llx), HCA(%llx) %d",
447 	    (void *)chan->ic_chanhdl,
448 	    (longlong_t)local_ip->un.ip4addr,
449 	    (longlong_t)remote_ip->un.ip4addr,
450 	    (longlong_t)lgid.gid_prefix, (longlong_t)lgid.gid_guid,
451 	    (longlong_t)hca->hca_guid, hca_port);
452 
453 	chan->ic_ibt_path	= ibt_path;
454 	chan->ic_localip	= path_src_ip.ip_primary;
455 	chan->ic_remoteip	= *remote_ip;
456 
457 	return (chan);
458 }
459 
460 /*
461  * iser_ib_alloc_rc_channel
462  *
463  * This function allocates a reliable communication channel using the specified
464  * channel attributes.
465  */
466 iser_chan_t *
467 iser_ib_alloc_rc_channel(iser_hca_t *hca, uint8_t hca_port)
468 {
469 
470 	iser_chan_t			*chan;
471 	ibt_rc_chan_alloc_args_t	chanargs;
472 	uint_t				sq_size, rq_size;
473 	int				status;
474 
475 	chan = kmem_zalloc(sizeof (iser_chan_t), KM_SLEEP);
476 
477 	mutex_init(&chan->ic_lock, NULL, MUTEX_DRIVER, NULL);
478 	mutex_init(&chan->ic_sq_post_lock, NULL, MUTEX_DRIVER, NULL);
479 
480 	/* Set up the iSER channel handle with HCA */
481 	chan->ic_hca		= hca;
482 
483 	/*
484 	 * Determine the queue sizes, based upon the HCA query data.
485 	 * For our Work Queues, we will use either our default value,
486 	 * or the HCA's maximum value, whichever is smaller.
487 	 */
488 	sq_size = min(hca->hca_attr.hca_max_chan_sz, ISER_IB_SENDQ_SIZE);
489 	rq_size = min(hca->hca_attr.hca_max_chan_sz, ISER_IB_RECVQ_SIZE);
490 
491 	/*
492 	 * For our Completion Queues, we again check the device maximum.
493 	 * We want to end up with CQs that are the next size up from the
494 	 * WQs they are servicing so that they have some overhead.
495 	 */
496 	if (hca->hca_attr.hca_max_cq_sz >= (sq_size + 1)) {
497 		chan->ic_sendcq_sz = sq_size + 1;
498 	} else {
499 		chan->ic_sendcq_sz = hca->hca_attr.hca_max_cq_sz;
500 		sq_size = chan->ic_sendcq_sz - 1;
501 	}
502 
503 	if (hca->hca_attr.hca_max_cq_sz >= (rq_size + 1)) {
504 		chan->ic_recvcq_sz = rq_size + 1;
505 	} else {
506 		chan->ic_recvcq_sz = hca->hca_attr.hca_max_cq_sz;
507 		rq_size = chan->ic_recvcq_sz - 1;
508 	}
509 
510 	/* Initialize the iSER channel's QP handle */
511 	iser_ib_init_qp(chan, sq_size, rq_size);
512 
513 	/* Set up the Send Completion Queue */
514 	status = iser_ib_setup_cq(hca->hca_hdl, chan->ic_sendcq_sz,
515 	    &chan->ic_sendcq);
516 	if (status != ISER_STATUS_SUCCESS) {
517 		iser_ib_fini_qp(&chan->ic_qp);
518 		mutex_destroy(&chan->ic_lock);
519 		mutex_destroy(&chan->ic_sq_post_lock);
520 		kmem_free(chan, sizeof (iser_chan_t));
521 		return (NULL);
522 	}
523 	ibt_set_cq_handler(chan->ic_sendcq, iser_ib_sendcq_handler, chan);
524 	ibt_enable_cq_notify(chan->ic_sendcq, IBT_NEXT_COMPLETION);
525 
526 	/* Set up the Receive Completion Queue */
527 	status = iser_ib_setup_cq(hca->hca_hdl, chan->ic_recvcq_sz,
528 	    &chan->ic_recvcq);
529 	if (status != ISER_STATUS_SUCCESS) {
530 		(void) ibt_free_cq(chan->ic_sendcq);
531 		iser_ib_fini_qp(&chan->ic_qp);
532 		mutex_destroy(&chan->ic_lock);
533 		mutex_destroy(&chan->ic_sq_post_lock);
534 		kmem_free(chan, sizeof (iser_chan_t));
535 		return (NULL);
536 	}
537 	ibt_set_cq_handler(chan->ic_recvcq, iser_ib_recvcq_handler, chan);
538 	ibt_enable_cq_notify(chan->ic_recvcq, IBT_NEXT_COMPLETION);
539 
540 	/* Setup the channel arguments */
541 	iser_ib_setup_chanargs(hca_port, chan->ic_sendcq, chan->ic_recvcq,
542 	    sq_size, rq_size, hca->hca_pdhdl, &chanargs);
543 
544 	status = ibt_alloc_rc_channel(hca->hca_hdl,
545 	    IBT_ACHAN_NO_FLAGS, &chanargs, &chan->ic_chanhdl, NULL);
546 	if (status != IBT_SUCCESS) {
547 		ISER_LOG(CE_NOTE, "iser_ib_alloc_rc_channel: failed "
548 		    "ibt_alloc_rc_channel: status (%d)", status);
549 		(void) ibt_free_cq(chan->ic_sendcq);
550 		(void) ibt_free_cq(chan->ic_recvcq);
551 		iser_ib_fini_qp(&chan->ic_qp);
552 		mutex_destroy(&chan->ic_lock);
553 		mutex_destroy(&chan->ic_sq_post_lock);
554 		kmem_free(chan, sizeof (iser_chan_t));
555 		return (NULL);
556 	}
557 
558 	/* Set the 'channel' as the client private data */
559 	(void) ibt_set_chan_private(chan->ic_chanhdl, chan);
560 
561 	return (chan);
562 }
563 
564 /*
565  * iser_ib_open_rc_channel
566  * This function opens a RC connection on the given allocated RC channel
567  */
int
iser_ib_open_rc_channel(iser_chan_t *chan)
{
	ibt_ip_cm_info_t	ipcm_info;
	iser_private_data_t	iser_priv_data;
	ibt_chan_open_args_t	ocargs;
	ibt_rc_returns_t	ocreturns;
	int			status;

	/* ic_lock is held across the entire (blocking) open */
	mutex_enter(&chan->ic_lock);

	/*
	 * For connection establishment, the initiator sends a CM REQ using the
	 * iSER RDMA-Aware Service ID. Included are the source and destination
	 * IP addresses, and the src port.
	 */
	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
	ipcm_info.src_addr = chan->ic_localip;
	ipcm_info.dst_addr = chan->ic_remoteip;
	ipcm_info.src_port = chan->ic_lport;

	/*
	 * The CM Private Data field defines the iSER connection parameters
	 * such as zero based virtual address exception (ZBVAE) and Send with
	 * invalidate Exception (SIE).
	 *
	 * Solaris IBT does not currently support ZBVAE or SIE.
	 */
	iser_priv_data.rsvd1	= 0;
	iser_priv_data.sie	= 1;	/* exception declared: SIE unsupported */
	iser_priv_data.zbvae	= 1;	/* exception declared: ZBVAE unsupported */

	status = ibt_format_ip_private_data(&ipcm_info,
	    sizeof (iser_private_data_t), &iser_priv_data);
	if (status != IBT_SUCCESS) {
		ISER_LOG(CE_NOTE, "iser_ib_open_rc_channel failed: %d", status);
		mutex_exit(&chan->ic_lock);
		return (status);
	}

	/*
	 * Set the SID we are attempting to connect to, based upon the
	 * remote port number.
	 */
	chan->ic_ibt_path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, chan->ic_rport);

	/* Set up the args for the channel open */
	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
	ocargs.oc_path			= &chan->ic_ibt_path;
	ocargs.oc_cm_handler		= iser_ib_cm_handler;
	ocargs.oc_cm_clnt_private	= iser_state;
	ocargs.oc_rdma_ra_out		= 4;	/* outbound RDMA-read depth */
	ocargs.oc_rdma_ra_in		= 4;	/* inbound RDMA-read depth */
	ocargs.oc_path_retry_cnt	= 2;
	ocargs.oc_path_rnr_retry_cnt	= 2;
	ocargs.oc_priv_data_len		= sizeof (iser_private_data_t);
	ocargs.oc_priv_data		= &iser_priv_data;

	bzero(&ocreturns, sizeof (ibt_rc_returns_t));

	/* Blocking open: returns once the connection attempt resolves */
	status = ibt_open_rc_channel(chan->ic_chanhdl,
	    IBT_OCHAN_NO_FLAGS, IBT_BLOCKING, &ocargs, &ocreturns);

	if (status != IBT_SUCCESS) {
		ISER_LOG(CE_NOTE, "iser_ib_open_rc_channel failed: %d", status);
		mutex_exit(&chan->ic_lock);
		return (status);
	}

	mutex_exit(&chan->ic_lock);
	return (IDM_STATUS_SUCCESS);
}
640 
641 /*
642  * iser_ib_close_rc_channel
643  * This function closes the RC channel related to this iser_chan handle.
644  * We invoke this as a blocking call (IBT_BLOCKING), with no callbacks.
645  */
646 void
647 iser_ib_close_rc_channel(iser_chan_t *chan)
648 {
649 	int			status;
650 
651 	mutex_enter(&chan->ic_lock);
652 	status = ibt_close_rc_channel(chan->ic_chanhdl, IBT_BLOCKING, NULL,
653 	    0, NULL, NULL, 0);
654 	if (status != IBT_SUCCESS) {
655 		ISER_LOG(CE_NOTE, "iser_ib_close_rc_channel: "
656 		    "ibt_close_rc_channel failed: status (%d)", status);
657 	}
658 	mutex_exit(&chan->ic_lock);
659 }
660 
661 /*
662  * iser_ib_free_rc_channel
663  *
664  * This function tears down an RC channel's QP initialization and frees it.
665  * Note that we do not need synchronization here; the channel has been
666  * closed already, so we should only have completion polling occurring.  Once
667  * complete, we are free to free the IBTF channel, WQ and CQ resources, and
668  * our own related resources.
669  */
void
iser_ib_free_rc_channel(iser_chan_t *chan)
{
	iser_qp_t	*iser_qp;

	iser_qp = &chan->ic_qp;

	/*
	 * NOTE(review): the mutex_exit()/mutex_enter() pairs on
	 * chan->ic_conn->ic_lock below imply the caller holds
	 * ic_conn->ic_lock on entry and expects it held on return --
	 * confirm against callers.
	 */

	/* Ensure the SQ is empty */
	while (chan->ic_sq_post_count != 0) {
		/* Drop the conn lock while waiting for sends to drain */
		mutex_exit(&chan->ic_conn->ic_lock);
		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
		mutex_enter(&chan->ic_conn->ic_lock);
	}
	mutex_destroy(&chan->ic_sq_post_lock);

	/* Ensure the RQ is empty */
	(void) ibt_flush_channel(chan->ic_chanhdl);
	mutex_enter(&iser_qp->qp_lock);
	while (iser_qp->rq_level != 0) {
		/* Drop both locks while the flushed RQ drains */
		mutex_exit(&iser_qp->qp_lock);
		mutex_exit(&chan->ic_conn->ic_lock);
		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
		mutex_enter(&chan->ic_conn->ic_lock);
		mutex_enter(&iser_qp->qp_lock);
	}

	/* Free our QP handle */
	mutex_exit(&iser_qp->qp_lock);
	(void) iser_ib_fini_qp(iser_qp);

	/* Free the IBT channel resources */
	(void) ibt_free_channel(chan->ic_chanhdl);
	chan->ic_chanhdl = NULL;

	/* Free the CQs */
	ibt_free_cq(chan->ic_sendcq);
	ibt_free_cq(chan->ic_recvcq);

	/* Free the chan handle */
	mutex_destroy(&chan->ic_lock);
	kmem_free(chan, sizeof (iser_chan_t));
}
712 
713 /*
714  * iser_ib_post_recv
715  *
716  * This function handles keeping the RQ full on a given channel.
717  * This routine will mostly be run on a taskq, and will check the
718  * current fill level of the RQ, and post as many WRs as necessary
719  * to fill it again.
720  */
721 
722 int
723 iser_ib_post_recv_async(ibt_channel_hdl_t chanhdl)
724 {
725 	iser_chan_t	*chan;
726 	int		status;
727 
728 	/* Pull our iSER channel handle from the private data */
729 	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);
730 
731 	/*
732 	 * Caller must check that chan->ic_conn->ic_stage indicates
733 	 * the connection is active (not closing, not closed) and
734 	 * it must hold the mutex cross the check and the call to this function
735 	 */
736 	ASSERT(mutex_owned(&chan->ic_conn->ic_lock));
737 	ASSERT((chan->ic_conn->ic_stage >= ISER_CONN_STAGE_IC_CONNECTED) &&
738 	    (chan->ic_conn->ic_stage <= ISER_CONN_STAGE_LOGGED_IN));
739 	idm_conn_hold(chan->ic_conn->ic_idmc);
740 	status = ddi_taskq_dispatch(iser_taskq, iser_ib_post_recv_task,
741 	    (void *)chanhdl, DDI_NOSLEEP);
742 	if (status != DDI_SUCCESS) {
743 		idm_conn_rele(chan->ic_conn->ic_idmc);
744 	}
745 
746 	return (status);
747 }
748 
749 static void
750 iser_ib_post_recv_task(void *arg)
751 {
752 	ibt_channel_hdl_t	chanhdl = arg;
753 	iser_chan_t		*chan;
754 
755 	/* Pull our iSER channel handle from the private data */
756 	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);
757 
758 	iser_ib_post_recv(chanhdl);
759 	idm_conn_rele(chan->ic_conn->ic_idmc);
760 }
761 
void
iser_ib_post_recv(ibt_channel_hdl_t chanhdl)
{
	iser_chan_t	*chan;
	iser_hca_t	*hca;
	iser_msg_t	*msg;
	ibt_recv_wr_t	*wrlist, wr[ISER_IB_RQ_POST_MAX];
	int		rq_space, msg_ret;
	int		total_num, npost;
	uint_t		nposted;
	int		status, i;
	iser_qp_t	*iser_qp;

	/* Pull our iSER channel handle from the private data */
	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);

	ASSERT(chan != NULL);

	/* Lock order used throughout: ic_conn->ic_lock, then qp_lock */
	mutex_enter(&chan->ic_conn->ic_lock);

	/* Bail out if the connection is closed; no need for more recv WRs */
	if ((chan->ic_conn->ic_stage == ISER_CONN_STAGE_CLOSING) ||
	    (chan->ic_conn->ic_stage == ISER_CONN_STAGE_CLOSED)) {
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	/* get the QP handle from the iser_chan */
	iser_qp = &chan->ic_qp;

	hca = chan->ic_hca;

	if (hca == NULL) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: unable to retrieve "
		    "HCA handle");
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	/* check for space to post on the RQ */
	mutex_enter(&iser_qp->qp_lock);
	rq_space = iser_qp->rq_depth - iser_qp->rq_level;
	if (rq_space == 0) {
		/* The RQ is full, clear the pending flag and return */
		iser_qp->rq_taskqpending = B_FALSE;
		mutex_exit(&iser_qp->qp_lock);
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	/* Keep track of the lowest value for rq_min_post_level */
	if (iser_qp->rq_level < iser_qp->rq_min_post_level)
		iser_qp->rq_min_post_level = iser_qp->rq_level;

	mutex_exit(&iser_qp->qp_lock);

	/* we've room to post, so pull from the msg cache */
	msg = iser_msg_get(hca, rq_space, &msg_ret);
	if (msg == NULL) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: no message handles "
		    "available in msg cache currently");
		/*
		 * There are no messages on the cache. Wait a half-
		 * second, then try again.
		 */
		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
		status = iser_ib_post_recv_async(chanhdl);
		if (status != DDI_SUCCESS) {
			ISER_LOG(CE_NOTE, "iser_ib_post_recv: failed to "
			    "redispatch routine");
			/* Failed to dispatch, clear pending flag */
			mutex_enter(&iser_qp->qp_lock);
			iser_qp->rq_taskqpending = B_FALSE;
			mutex_exit(&iser_qp->qp_lock);
		}
		mutex_exit(&chan->ic_conn->ic_lock);
		return;
	}

	if (msg_ret != rq_space) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: requested number of "
		    "messages not allocated: requested (%d) allocated (%d)",
		    rq_space, msg_ret);
		/* We got some, but not all, of our requested depth */
		rq_space = msg_ret;
	}

	/*
	 * Now, walk through the allocated WRs and post them,
	 * ISER_IB_RQ_POST_MAX (or less) at a time. Ownership of each
	 * msg handle passes to the RQ via the wr_id back-pointer.
	 */
	wrlist = &wr[0];
	total_num = rq_space;

	while (total_num) {
		/* determine the number to post on this iteration */
		npost = (total_num > ISER_IB_RQ_POST_MAX) ?
		    ISER_IB_RQ_POST_MAX : total_num;

		/* build a list of WRs from the msg list */
		for (i = 0; i < npost; i++) {
			wrlist[i].wr_id		= (ibt_wrid_t)(uintptr_t)msg;
			wrlist[i].wr_nds	= ISER_IB_SGLIST_SIZE;
			wrlist[i].wr_sgl	= &msg->msg_ds;
			msg = msg->nextp;
		}

		/* post the list to the RQ */
		nposted = 0;
		status = ibt_post_recv(chanhdl, wrlist, npost, &nposted);
		if ((status != IBT_SUCCESS) || (nposted != npost)) {
			/*
			 * NOTE(review): the msg handles that were pulled
			 * from the cache but never posted do not appear
			 * to be returned to the cache here -- confirm
			 * whether this leaks on partial post failure.
			 */
			ISER_LOG(CE_NOTE, "iser_ib_post_recv: ibt_post_recv "
			    "failed: requested (%d) posted (%d) status (%d)",
			    npost, nposted, status);
			total_num -= nposted;
			break;
		}

		/* decrement total number to post by the number posted */
		total_num -= nposted;
	}

	/* Account for what actually made it onto the RQ */
	mutex_enter(&iser_qp->qp_lock);
	if (total_num != 0) {
		ISER_LOG(CE_NOTE, "iser_ib_post_recv: unable to fill RQ, "
		    "failed to post (%d) WRs", total_num);
		iser_qp->rq_level += rq_space - total_num;
	} else {
		iser_qp->rq_level += rq_space;
	}

	/*
	 * Now that we've filled the RQ, check that all of the recv WRs
	 * haven't just been immediately consumed. If so, taskqpending is
	 * still B_TRUE, so we need to fire off a taskq thread to post
	 * more WRs.
	 */
	if (iser_qp->rq_level == 0) {
		mutex_exit(&iser_qp->qp_lock);
		status = iser_ib_post_recv_async(chanhdl);
		if (status != DDI_SUCCESS) {
			ISER_LOG(CE_NOTE, "iser_ib_post_recv: failed to "
			    "dispatch followup routine");
			/* Failed to dispatch, clear pending flag */
			mutex_enter(&iser_qp->qp_lock);
			iser_qp->rq_taskqpending = B_FALSE;
			mutex_exit(&iser_qp->qp_lock);
		}
	} else {
		/*
		 * We're done, we've filled the RQ. Clear the taskq
		 * flag so that we can run again.
		 */
		iser_qp->rq_taskqpending = B_FALSE;
		mutex_exit(&iser_qp->qp_lock);
	}

	mutex_exit(&chan->ic_conn->ic_lock);
}
921 
922 /*
923  * iser_ib_handle_portup_event()
924  * This handles the IBT_EVENT_PORT_UP unaffiliated asynchronous event.
925  *
926  * To facilitate a seamless bringover of the port and configure the CM service
927  * for inbound iSER service requests on this newly active port, the existing
928  * IDM services will be checked for iSER support.
929  * If an iSER service was already created, then this service will simply be
930  * bound to the gid of the newly active port. If on the other hand, the CM
931  * service did not exist, i.e. only socket communication, then a new CM
932  * service will be first registered with the saved service parameters and
933  * then bound to the newly active port.
934  *
935  */
/* ARGSUSED */
static void
iser_ib_handle_portup_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
{
	iser_hca_t		*hca;
	ib_gid_t		gid;
	idm_svc_t		*idm_svc;
	int			status;

	ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event: HCA(0x%llx) port(%d)",
	    (longlong_t)event->ev_hca_guid, event->ev_port);

	/*
	 * Query all ports on the HCA and update the port information
	 * maintained in the iser_hca_t structure
	 */
	hca = iser_ib_guid2hca(event->ev_hca_guid);
	if (hca == NULL) {

		/* HCA is just made available, first port on that HCA */
		hca = iser_ib_alloc_hca(event->ev_hca_guid);
		if (hca == NULL) {
			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
			    "iser_ib_alloc_hca failed: HCA(0x%llx) port(%d)",
			    (longlong_t)event->ev_hca_guid, event->ev_port);
			return;
		}
		/* Publish the new HCA on the global list */
		mutex_enter(&iser_state->is_hcalist_lock);
		list_insert_tail(&iser_state->is_hcalist, hca);
		iser_state->is_num_hcas++;
		mutex_exit(&iser_state->is_hcalist_lock);

	} else {

		/* Known HCA: refresh its cached port state */
		status = iser_ib_update_hcaports(hca);

		if (status != IBT_SUCCESS) {
			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
			    "status(0x%x): iser_ib_update_hcaports failed: "
			    "HCA(0x%llx) port(%d)", status,
			    (longlong_t)event->ev_hca_guid, event->ev_port);
			return;
		}
	}

	/* ev_port is 1-based; take the first GID of the newly-up port */
	gid = hca->hca_port_info[event->ev_port - 1].p_sgid_tbl[0];

	/*
	 * Iterate through the global list of IDM target services
	 * and check for existing iSER CM service.
	 */
	mutex_enter(&idm.idm_global_mutex);
	for (idm_svc = list_head(&idm.idm_tgt_svc_list);
	    idm_svc != NULL;
	    idm_svc = list_next(&idm.idm_tgt_svc_list, idm_svc)) {


		if (idm_svc->is_iser_svc == NULL) {

			/* Establish a new CM service for iSER requests */
			status = iser_tgt_svc_create(
			    &idm_svc->is_svc_req, idm_svc);

			if (status != IBT_SUCCESS) {
				ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
				    "status(0x%x): iser_tgt_svc_create failed: "
				    "HCA(0x%llx) port(%d)", status,
				    (longlong_t)event->ev_hca_guid,
				    event->ev_port);

				/* Try the next service; don't give up */
				continue;
			}
		}

		/* Bind the (new or existing) service to the port's GID */
		status = iser_ib_activate_port(
		    idm_svc, event->ev_hca_guid, gid);
		if (status != IBT_SUCCESS) {

			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
			    "status(0x%x): Bind service on port "
			    "(%llx:%llx) failed",
			    status, (longlong_t)gid.gid_prefix,
			    (longlong_t)gid.gid_guid);

			continue;
		}
		ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event: service bound "
		    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
		    event->ev_port);
	}
	mutex_exit(&idm.idm_global_mutex);

	ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event success: "
	    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
	    event->ev_port);
}
1032 
1033 /*
1034  * iser_ib_handle_portdown_event()
1035  * This handles the IBT_EVENT_PORT_DOWN unaffiliated asynchronous error.
1036  *
1037  * Unconfigure the CM service on the deactivated port and teardown the
1038  * connections that are using the CM service.
1039  */
1040 /* ARGSUSED */
1041 static void
1042 iser_ib_handle_portdown_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
1043 {
1044 	iser_hca_t		*hca;
1045 	ib_gid_t		gid;
1046 	int			status;
1047 
1048 	/*
1049 	 * Query all ports on the HCA and update the port information
1050 	 * maintainted in the iser_hca_t structure
1051 	 */
1052 	hca = iser_ib_guid2hca(event->ev_hca_guid);
1053 	ASSERT(hca != NULL);
1054 
1055 	status = iser_ib_update_hcaports(hca);
1056 	if (status != IBT_SUCCESS) {
1057 		ISER_LOG(CE_NOTE, "iser_ib_handle_portdown_event status(0x%x): "
1058 		    "ibt_ib_update_hcaports failed: HCA(0x%llx) port(%d)",
1059 		    status, (longlong_t)event->ev_hca_guid, event->ev_port);
1060 		return;
1061 	}
1062 
1063 	/* get the gid of the new port */
1064 	gid = hca->hca_port_info[event->ev_port - 1].p_sgid_tbl[0];
1065 	iser_ib_deactivate_port(event->ev_hca_guid, gid);
1066 
1067 	ISER_LOG(CE_NOTE, "iser_ib_handle_portdown_event success: "
1068 	    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
1069 	    event->ev_port);
1070 }
1071 
1072 /*
1073  * iser_ib_handle_hca_detach_event()
1074  * Quiesce all activity bound for the port, teardown the connection, unbind
1075  * iSER services on all ports and release the HCA handle.
1076  */
1077 /* ARGSUSED */
1078 static void
1079 iser_ib_handle_hca_detach_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
1080 {
1081 	iser_hca_t	*nexthca, *hca;
1082 	int		i, status;
1083 
1084 	ISER_LOG(CE_NOTE, "iser_ib_handle_hca_detach_event: HCA(0x%llx)",
1085 	    (longlong_t)event->ev_hca_guid);
1086 
1087 	hca = iser_ib_guid2hca(event->ev_hca_guid);
1088 	for (i = 0; i < hca->hca_num_ports; i++) {
1089 		iser_ib_deactivate_port(hca->hca_guid,
1090 		    hca->hca_port_info[i].p_sgid_tbl[0]);
1091 	}
1092 
1093 	/*
1094 	 * Update the HCA list maintained in the iser_state. Free the
1095 	 * resources allocated to the HCA, i.e. caches, protection domain
1096 	 */
1097 	mutex_enter(&iser_state->is_hcalist_lock);
1098 
1099 	for (hca = list_head(&iser_state->is_hcalist);
1100 	    hca != NULL;
1101 	    hca = nexthca) {
1102 
1103 		nexthca = list_next(&iser_state->is_hcalist, hca);
1104 
1105 		if (hca->hca_guid == event->ev_hca_guid) {
1106 
1107 			list_remove(&iser_state->is_hcalist, hca);
1108 			iser_state->is_num_hcas--;
1109 
1110 			status = iser_ib_free_hca(hca);
1111 			if (status != DDI_SUCCESS) {
1112 				ISER_LOG(CE_WARN, "iser_ib_handle_hca_detach: "
1113 				    "Failed to free hca(%p)", (void *)hca);
1114 				list_insert_tail(&iser_state->is_hcalist, hca);
1115 				iser_state->is_num_hcas++;
1116 			}
1117 			/* No way to return status to IBT if this fails */
1118 		}
1119 	}
1120 	mutex_exit(&iser_state->is_hcalist_lock);
1121 
1122 }
1123 
1124 /*
1125  * iser_ib_async_handler
1126  * An IBT Asynchronous Event handler is registered it with the framework and
1127  * passed via the ibt_attach() routine. This function handles the following
1128  * asynchronous events.
1129  * IBT_EVENT_PORT_UP
1130  * IBT_ERROR_PORT_DOWN
1131  * IBT_HCA_ATTACH_EVENT
1132  * IBT_HCA_DETACH_EVENT
1133  */
1134 /* ARGSUSED */
1135 void
1136 iser_ib_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
1137     ibt_async_event_t *event)
1138 {
1139 	switch (code) {
1140 	case IBT_EVENT_PORT_UP:
1141 		iser_ib_handle_portup_event(hdl, event);
1142 		break;
1143 
1144 	case IBT_ERROR_PORT_DOWN:
1145 		iser_ib_handle_portdown_event(hdl, event);
1146 		break;
1147 
1148 	case IBT_HCA_ATTACH_EVENT:
1149 		/*
1150 		 * A new HCA device is available for use, ignore this
1151 		 * event because the corresponding IBT_EVENT_PORT_UP
1152 		 * events will get triggered and handled accordingly.
1153 		 */
1154 		break;
1155 
1156 	case IBT_HCA_DETACH_EVENT:
1157 		iser_ib_handle_hca_detach_event(hdl, event);
1158 		break;
1159 
1160 	default:
1161 		break;
1162 	}
1163 }
1164 
1165 /*
1166  * iser_ib_init_hcas
1167  *
1168  * This function opens all the HCA devices, gathers the HCA state information
1169  * and adds the HCA handle for each HCA found in the iser_soft_state.
1170  */
1171 static int
1172 iser_ib_init_hcas(void)
1173 {
1174 	ib_guid_t	*guid;
1175 	int		num_hcas;
1176 	int		i;
1177 	iser_hca_t	*hca;
1178 
1179 	/* Retrieve the HCA list */
1180 	num_hcas = ibt_get_hca_list(&guid);
1181 	if (num_hcas == 0) {
1182 		/*
1183 		 * This shouldn't happen, but might if we have all HCAs
1184 		 * detach prior to initialization.
1185 		 */
1186 		return (DDI_FAILURE);
1187 	}
1188 
1189 	/* Initialize the hcalist lock */
1190 	mutex_init(&iser_state->is_hcalist_lock, NULL, MUTEX_DRIVER, NULL);
1191 
1192 	/* Create the HCA list */
1193 	list_create(&iser_state->is_hcalist, sizeof (iser_hca_t),
1194 	    offsetof(iser_hca_t, hca_node));
1195 
1196 	for (i = 0; i < num_hcas; i++) {
1197 
1198 		ISER_LOG(CE_NOTE, "iser_ib_init_hcas: initializing HCA "
1199 		    "(0x%llx)", (longlong_t)guid[i]);
1200 
1201 		hca = iser_ib_alloc_hca(guid[i]);
1202 		if (hca == NULL) {
1203 			/* This shouldn't happen, teardown and fail */
1204 			(void) iser_ib_fini_hcas();
1205 			(void) ibt_free_hca_list(guid, num_hcas);
1206 			return (DDI_FAILURE);
1207 		}
1208 
1209 		mutex_enter(&iser_state->is_hcalist_lock);
1210 		list_insert_tail(&iser_state->is_hcalist, hca);
1211 		iser_state->is_num_hcas++;
1212 		mutex_exit(&iser_state->is_hcalist_lock);
1213 
1214 	}
1215 
1216 	/* Free the IBT HCA list */
1217 	(void) ibt_free_hca_list(guid, num_hcas);
1218 
1219 	/* Check that we've initialized at least one HCA */
1220 	mutex_enter(&iser_state->is_hcalist_lock);
1221 	if (list_is_empty(&iser_state->is_hcalist)) {
1222 		ISER_LOG(CE_NOTE, "iser_ib_init_hcas: failed to initialize "
1223 		    "any HCAs");
1224 
1225 		mutex_exit(&iser_state->is_hcalist_lock);
1226 		(void) iser_ib_fini_hcas();
1227 		return (DDI_FAILURE);
1228 	}
1229 	mutex_exit(&iser_state->is_hcalist_lock);
1230 
1231 	return (DDI_SUCCESS);
1232 }
1233 
1234 /*
1235  * iser_ib_fini_hcas
1236  *
1237  * Teardown the iSER HCA list initialized above.
1238  */
1239 static int
1240 iser_ib_fini_hcas(void)
1241 {
1242 	iser_hca_t	*nexthca, *hca;
1243 	int		status;
1244 
1245 	mutex_enter(&iser_state->is_hcalist_lock);
1246 	for (hca = list_head(&iser_state->is_hcalist);
1247 	    hca != NULL;
1248 	    hca = nexthca) {
1249 
1250 		nexthca = list_next(&iser_state->is_hcalist, hca);
1251 
1252 		list_remove(&iser_state->is_hcalist, hca);
1253 
1254 		status = iser_ib_free_hca(hca);
1255 		if (status != IBT_SUCCESS) {
1256 			ISER_LOG(CE_NOTE, "iser_ib_fini_hcas: failed to free "
1257 			    "HCA during fini");
1258 			list_insert_tail(&iser_state->is_hcalist, hca);
1259 			return (DDI_FAILURE);
1260 		}
1261 
1262 		iser_state->is_num_hcas--;
1263 
1264 	}
1265 	mutex_exit(&iser_state->is_hcalist_lock);
1266 	list_destroy(&iser_state->is_hcalist);
1267 	mutex_destroy(&iser_state->is_hcalist_lock);
1268 
1269 	return (DDI_SUCCESS);
1270 }
1271 
1272 /*
1273  * iser_ib_alloc_hca
1274  *
1275  * This function opens the given HCA device, gathers the HCA state information
1276  * and adds the HCA handle
1277  */
1278 static iser_hca_t *
1279 iser_ib_alloc_hca(ib_guid_t guid)
1280 {
1281 	iser_hca_t	*hca;
1282 	int		status;
1283 
1284 	/* Allocate an iser_hca_t HCA handle */
1285 	hca = (iser_hca_t *)kmem_zalloc(sizeof (iser_hca_t), KM_SLEEP);
1286 
1287 	/* Open this HCA */
1288 	status = ibt_open_hca(iser_state->is_ibhdl, guid, &hca->hca_hdl);
1289 	if (status != IBT_SUCCESS) {
1290 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_open_hca failed:"
1291 		    " guid (0x%llx) status (0x%x)", (longlong_t)guid, status);
1292 		kmem_free(hca, sizeof (iser_hca_t));
1293 		return (NULL);
1294 	}
1295 
1296 	hca->hca_guid		= guid;
1297 	hca->hca_clnt_hdl	= iser_state->is_ibhdl;
1298 
1299 	/* Query the HCA */
1300 	status = ibt_query_hca(hca->hca_hdl, &hca->hca_attr);
1301 	if (status != IBT_SUCCESS) {
1302 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_query_hca "
1303 		    "failure: guid (0x%llx) status (0x%x)",
1304 		    (longlong_t)guid, status);
1305 		(void) ibt_close_hca(hca->hca_hdl);
1306 		kmem_free(hca, sizeof (iser_hca_t));
1307 		return (NULL);
1308 	}
1309 
1310 	/* Query all ports on the HCA */
1311 	status = ibt_query_hca_ports(hca->hca_hdl, 0,
1312 	    &hca->hca_port_info, &hca->hca_num_ports,
1313 	    &hca->hca_port_info_sz);
1314 	if (status != IBT_SUCCESS) {
1315 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: "
1316 		    "ibt_query_hca_ports failure: guid (0x%llx) "
1317 		    "status (0x%x)", (longlong_t)guid, status);
1318 		(void) ibt_close_hca(hca->hca_hdl);
1319 		kmem_free(hca, sizeof (iser_hca_t));
1320 		return (NULL);
1321 	}
1322 
1323 	/* Allocate a single PD on this HCA */
1324 	status = ibt_alloc_pd(hca->hca_hdl, IBT_PD_NO_FLAGS,
1325 	    &hca->hca_pdhdl);
1326 	if (status != IBT_SUCCESS) {
1327 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_alloc_pd "
1328 		    "failure: guid (0x%llx) status (0x%x)",
1329 		    (longlong_t)guid, status);
1330 		(void) ibt_close_hca(hca->hca_hdl);
1331 		ibt_free_portinfo(hca->hca_port_info, hca->hca_port_info_sz);
1332 		kmem_free(hca, sizeof (iser_hca_t));
1333 		return (NULL);
1334 	}
1335 
1336 	/* Initialize the message and data MR caches for this HCA */
1337 	iser_init_hca_caches(hca);
1338 
1339 	return (hca);
1340 }
1341 
/*
 * iser_ib_free_hca
 * Release all resources held by an iser_hca_t: the MR caches, the
 * protection domain, the HCA handle, the cached port info, and finally the
 * structure itself. Returns DDI_SUCCESS on complete teardown.
 *
 * If teardown fails partway, the function attempts to restore the HCA to
 * its prior state (re-allocating the PD and re-initializing the caches)
 * and returns DDI_FAILURE. If even the restore fails, hca_failed is set so
 * subsequent free attempts bail out immediately.
 */
static int
iser_ib_free_hca(iser_hca_t *hca)
{
	int			status;
	ibt_hca_portinfo_t	*hca_port_info;
	uint_t			hca_port_info_sz;

	ASSERT(hca != NULL);
	if (hca->hca_failed)
		return (DDI_FAILURE);

	/* Saved locally: hca is freed before these are released below */
	hca_port_info = hca->hca_port_info;
	hca_port_info_sz = hca->hca_port_info_sz;

	/*
	 * Free the memory regions before freeing
	 * the associated protection domain
	 */
	iser_fini_hca_caches(hca);

	status = ibt_free_pd(hca->hca_hdl, hca->hca_pdhdl);
	if (status != IBT_SUCCESS) {
		ISER_LOG(CE_NOTE, "iser_ib_free_hca: failed to free PD "
		    "status=0x%x", status);
		goto out_caches;
	}

	status = ibt_close_hca(hca->hca_hdl);
	if (status != IBT_SUCCESS) {
		ISER_LOG(CE_NOTE, "iser_ib_fini_hcas: failed to close HCA "
		    "status=0x%x", status);
		goto out_pd;
	}

	ibt_free_portinfo(hca_port_info, hca_port_info_sz);

	kmem_free(hca, sizeof (iser_hca_t));
	return (DDI_SUCCESS);

	/*
	 * We only managed to partially tear down the HCA, try to put it back
	 * like it was before returning.
	 */
out_pd:
	status = ibt_alloc_pd(hca->hca_hdl, IBT_PD_NO_FLAGS, &hca->hca_pdhdl);
	if (status != IBT_SUCCESS) {
		hca->hca_failed = B_TRUE;
		/* Report error and exit */
		ISER_LOG(CE_NOTE, "iser_ib_free_hca: could not re-alloc PD "
		    "status=0x%x", status);
		return (DDI_FAILURE);
	}

out_caches:
	/* Re-create the MR caches torn down above; falls through from out_pd */
	iser_init_hca_caches(hca);

	return (DDI_FAILURE);
}
1400 
1401 static int
1402 iser_ib_update_hcaports(iser_hca_t *hca)
1403 {
1404 	ibt_hca_portinfo_t	*pinfop, *oldpinfop;
1405 	uint_t			size, oldsize, nport;
1406 	int			status;
1407 
1408 	ASSERT(hca != NULL);
1409 
1410 	status = ibt_query_hca_ports(hca->hca_hdl, 0, &pinfop, &nport, &size);
1411 	if (status != IBT_SUCCESS) {
1412 		ISER_LOG(CE_NOTE, "ibt_query_hca_ports failed: %d", status);
1413 		return (status);
1414 	}
1415 
1416 	oldpinfop = hca->hca_port_info;
1417 	oldsize	= hca->hca_port_info_sz;
1418 	hca->hca_port_info = pinfop;
1419 	hca->hca_port_info_sz = size;
1420 
1421 	(void) ibt_free_portinfo(oldpinfop, oldsize);
1422 
1423 	return (IBT_SUCCESS);
1424 }
1425 
1426 /*
1427  * iser_ib_gid2hca
1428  * Given a gid, find the corresponding hca
1429  */
1430 iser_hca_t *
1431 iser_ib_gid2hca(ib_gid_t gid)
1432 {
1433 
1434 	iser_hca_t	*hca;
1435 	int		i;
1436 
1437 	mutex_enter(&iser_state->is_hcalist_lock);
1438 	for (hca = list_head(&iser_state->is_hcalist);
1439 	    hca != NULL;
1440 	    hca = list_next(&iser_state->is_hcalist, hca)) {
1441 
1442 		for (i = 0; i < hca->hca_num_ports; i++) {
1443 			if ((hca->hca_port_info[i].p_sgid_tbl[0].gid_prefix ==
1444 			    gid.gid_prefix) &&
1445 			    (hca->hca_port_info[i].p_sgid_tbl[0].gid_guid ==
1446 			    gid.gid_guid)) {
1447 
1448 				mutex_exit(&iser_state->is_hcalist_lock);
1449 
1450 				return (hca);
1451 			}
1452 		}
1453 	}
1454 	mutex_exit(&iser_state->is_hcalist_lock);
1455 	return (NULL);
1456 }
1457 
1458 /*
1459  * iser_ib_guid2hca
1460  * Given a HCA guid, find the corresponding HCA
1461  */
1462 iser_hca_t *
1463 iser_ib_guid2hca(ib_guid_t guid)
1464 {
1465 
1466 	iser_hca_t	*hca;
1467 
1468 	mutex_enter(&iser_state->is_hcalist_lock);
1469 	for (hca = list_head(&iser_state->is_hcalist);
1470 	    hca != NULL;
1471 	    hca = list_next(&iser_state->is_hcalist, hca)) {
1472 
1473 		if (hca->hca_guid == guid) {
1474 			mutex_exit(&iser_state->is_hcalist_lock);
1475 			return (hca);
1476 		}
1477 	}
1478 	mutex_exit(&iser_state->is_hcalist_lock);
1479 	return (NULL);
1480 }
1481 
1482 /*
1483  * iser_ib_conv_sockaddr2ibtaddr
1484  * This function converts a socket address into the IBT format
1485  */
1486 void iser_ib_conv_sockaddr2ibtaddr(
1487     idm_sockaddr_t *saddr, ibt_ip_addr_t *ibt_addr)
1488 {
1489 	if (saddr == NULL) {
1490 		ibt_addr->family = AF_UNSPEC;
1491 		ibt_addr->un.ip4addr = 0;
1492 	} else {
1493 		switch (saddr->sin.sa_family) {
1494 		case AF_INET:
1495 
1496 			ibt_addr->family	= saddr->sin4.sin_family;
1497 			ibt_addr->un.ip4addr	= saddr->sin4.sin_addr.s_addr;
1498 			break;
1499 
1500 		case AF_INET6:
1501 
1502 			ibt_addr->family	= saddr->sin6.sin6_family;
1503 			ibt_addr->un.ip6addr	= saddr->sin6.sin6_addr;
1504 			break;
1505 
1506 		default:
1507 			ibt_addr->family = AF_UNSPEC;
1508 		}
1509 
1510 	}
1511 }
1512 
1513 /*
1514  * iser_ib_conv_ibtaddr2sockaddr
1515  * This function converts an IBT ip address handle to a sockaddr
1516  */
1517 void iser_ib_conv_ibtaddr2sockaddr(struct sockaddr_storage *ss,
1518     ibt_ip_addr_t *ibt_addr, in_port_t port)
1519 {
1520 	struct sockaddr_in *sin;
1521 	struct sockaddr_in6 *sin6;
1522 
1523 	switch (ibt_addr->family) {
1524 	case AF_INET:
1525 	case AF_UNSPEC:
1526 
1527 		sin = (struct sockaddr_in *)ibt_addr;
1528 		sin->sin_port = ntohs(port);
1529 		bcopy(sin, ss, sizeof (struct sockaddr_in));
1530 		break;
1531 
1532 	case AF_INET6:
1533 
1534 		sin6 = (struct sockaddr_in6 *)ibt_addr;
1535 		sin6->sin6_port = ntohs(port);
1536 		bcopy(sin6, ss, sizeof (struct sockaddr_in6));
1537 		break;
1538 
1539 	default:
1540 		ISER_LOG(CE_NOTE, "iser_ib_conv_ibtaddr2sockaddr: "
1541 		    "unknown family type: 0x%x", ibt_addr->family);
1542 	}
1543 }
1544 
1545 /*
1546  * iser_ib_setup_cq
1547  * This function sets up the Completion Queue size and allocates the specified
1548  * Completion Queue
1549  */
1550 static int
1551 iser_ib_setup_cq(ibt_hca_hdl_t hca_hdl, uint_t cq_size, ibt_cq_hdl_t *cq_hdl)
1552 {
1553 
1554 	ibt_cq_attr_t		cq_attr;
1555 	int			status;
1556 
1557 	cq_attr.cq_size		= cq_size;
1558 	cq_attr.cq_sched	= 0;
1559 	cq_attr.cq_flags	= IBT_CQ_NO_FLAGS;
1560 
1561 	/* Allocate a Completion Queue */
1562 	status = ibt_alloc_cq(hca_hdl, &cq_attr, cq_hdl, NULL);
1563 	if (status != IBT_SUCCESS) {
1564 		ISER_LOG(CE_NOTE, "iser_ib_setup_cq: ibt_alloc_cq failure (%d)",
1565 		    status);
1566 		return (status);
1567 	}
1568 
1569 	return (ISER_STATUS_SUCCESS);
1570 }
1571 
1572 /*
1573  * iser_ib_setup_chanargs
1574  *
1575  */
1576 static void
1577 iser_ib_setup_chanargs(uint8_t hca_port, ibt_cq_hdl_t scq_hdl,
1578     ibt_cq_hdl_t rcq_hdl, uint_t sq_size, uint_t rq_size,
1579     ibt_pd_hdl_t hca_pdhdl, ibt_rc_chan_alloc_args_t *cargs)
1580 {
1581 
1582 	bzero(cargs, sizeof (ibt_rc_chan_alloc_args_t));
1583 
1584 	/*
1585 	 * Set up the size of the channels send queue, receive queue and the
1586 	 * maximum number of elements in a scatter gather list of work requests
1587 	 * posted to the send and receive queues.
1588 	 */
1589 	cargs->rc_sizes.cs_sq		= sq_size;
1590 	cargs->rc_sizes.cs_rq		= rq_size;
1591 	cargs->rc_sizes.cs_sq_sgl	= ISER_IB_SGLIST_SIZE;
1592 	cargs->rc_sizes.cs_rq_sgl	= ISER_IB_SGLIST_SIZE;
1593 
1594 	/*
1595 	 * All Work requests signaled on a WR basis will receive a send
1596 	 * request completion.
1597 	 */
1598 	cargs->rc_flags			= IBT_ALL_SIGNALED;
1599 
1600 	/* Enable RDMA read and RDMA write on the channel end points */
1601 	cargs->rc_control		= IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1602 
1603 	/* Set the local hca port on which the channel is allocated */
1604 	cargs->rc_hca_port_num		= hca_port;
1605 
1606 	/* Set the Send and Receive Completion Queue handles */
1607 	cargs->rc_scq			= scq_hdl;
1608 	cargs->rc_rcq			= rcq_hdl;
1609 
1610 	/* Set the protection domain associated with the channel */
1611 	cargs->rc_pd			= hca_pdhdl;
1612 
1613 	/* No SRQ usage */
1614 	cargs->rc_srq			= NULL;
1615 }
1616 
1617 /*
1618  * iser_ib_init_qp
1619  * Initialize the QP handle
1620  */
1621 void
1622 iser_ib_init_qp(iser_chan_t *chan, uint_t sq_size, uint_t rq_size)
1623 {
1624 	/* Initialize the handle lock */
1625 	mutex_init(&chan->ic_qp.qp_lock, NULL, MUTEX_DRIVER, NULL);
1626 
1627 	/* Record queue sizes */
1628 	chan->ic_qp.sq_size = sq_size;
1629 	chan->ic_qp.rq_size = rq_size;
1630 
1631 	/* Initialize the RQ monitoring data */
1632 	chan->ic_qp.rq_depth  = rq_size;
1633 	chan->ic_qp.rq_level  = 0;
1634 	chan->ic_qp.rq_lwm = (chan->ic_recvcq_sz * ISER_IB_RQ_LWM_PCT) / 100;
1635 
1636 	/* Initialize the taskq flag */
1637 	chan->ic_qp.rq_taskqpending = B_FALSE;
1638 }
1639 
1640 /*
1641  * iser_ib_fini_qp
1642  * Teardown the QP handle
1643  */
1644 void
1645 iser_ib_fini_qp(iser_qp_t *qp)
1646 {
1647 	/* Destroy the handle lock */
1648 	mutex_destroy(&qp->qp_lock);
1649 }
1650 
1651 static int
1652 iser_ib_activate_port(idm_svc_t *idm_svc, ib_guid_t guid, ib_gid_t gid)
1653 {
1654 	iser_svc_t	*iser_svc;
1655 	iser_sbind_t	*is_sbind;
1656 	int		status;
1657 
1658 	iser_svc = idm_svc->is_iser_svc;
1659 
1660 	/*
1661 	 * Save the address of the service bind handle in the
1662 	 * iser_svc_t to undo the service binding at a later time
1663 	 */
1664 	is_sbind = kmem_zalloc(sizeof (iser_sbind_t), KM_SLEEP);
1665 	is_sbind->is_gid	= gid;
1666 	is_sbind->is_guid	= guid;
1667 
1668 	status  = ibt_bind_service(iser_svc->is_srvhdl, gid, NULL,
1669 	    idm_svc, &is_sbind->is_sbindhdl);
1670 
1671 	if (status != IBT_SUCCESS) {
1672 		ISER_LOG(CE_NOTE, "iser_ib_activate_port: status(0x%x): "
1673 		    "Bind service(%llx) on port(%llx:%llx) failed",
1674 		    status, (longlong_t)iser_svc->is_svcid,
1675 		    (longlong_t)gid.gid_prefix, (longlong_t)gid.gid_guid);
1676 
1677 		kmem_free(is_sbind, sizeof (iser_sbind_t));
1678 
1679 		return (status);
1680 	}
1681 
1682 	list_insert_tail(&iser_svc->is_sbindlist, is_sbind);
1683 
1684 	return (IBT_SUCCESS);
1685 }
1686 
/*
 * iser_ib_deactivate_port
 * Fail all iSER target connections on the given HCA and, for target-side
 * connections, unbind any iSER CM service still bound to (hca_guid, gid).
 */
static void
iser_ib_deactivate_port(ib_guid_t hca_guid, ib_gid_t gid)
{
	iser_svc_t	*iser_svc;
	iser_conn_t	*iser_conn;
	iser_sbind_t	*is_sbind;
	idm_conn_t	*idm_conn;

	/*
	 * Iterate through the global list of IDM target connections.
	 * Issue a TRANSPORT_FAIL for any connections on this port, and
	 * if there is a bound service running on the port, tear it down.
	 */
	mutex_enter(&idm.idm_global_mutex);
	for (idm_conn = list_head(&idm.idm_tgt_conn_list);
	    idm_conn != NULL;
	    idm_conn = list_next(&idm.idm_tgt_conn_list, idm_conn)) {

		if (idm_conn->ic_transport_type != IDM_TRANSPORT_TYPE_ISER) {
			/* this is not an iSER connection, skip it */
			continue;
		}

		iser_conn = idm_conn->ic_transport_private;
		/*
		 * NOTE(review): only the HCA guid is compared here, not the
		 * port gid, so connections on other ports of the same HCA
		 * are also failed -- confirm this is the intended behavior
		 * for the single-port PORT_DOWN case.
		 */
		if (iser_conn->ic_chan->ic_ibt_path.pi_hca_guid != hca_guid) {
			/* this iSER connection is on a different port */
			continue;
		}

		/* Fail the transport for this connection */
		idm_conn_event(idm_conn, CE_TRANSPORT_FAIL, IDM_STATUS_FAIL);

		if (idm_conn->ic_conn_type == CONN_TYPE_INI) {
			/* initiator connection, nothing else to do */
			continue;
		}

		/*
		 * Check for a service binding.
		 * NOTE(review): ic_svc_binding is dereferenced without a
		 * NULL check; presumably every target connection carries a
		 * service binding -- verify against the IDM connection setup.
		 */
		iser_svc = idm_conn->ic_svc_binding->is_iser_svc;
		is_sbind = iser_ib_get_bind(iser_svc, hca_guid, gid);
		if (is_sbind != NULL) {
			/* This service is still bound, tear it down */
			ibt_unbind_service(iser_svc->is_srvhdl,
			    is_sbind->is_sbindhdl);
			list_remove(&iser_svc->is_sbindlist, is_sbind);
			kmem_free(is_sbind, sizeof (iser_sbind_t));
		}
	}
	mutex_exit(&idm.idm_global_mutex);
}
1737 
1738 static iser_sbind_t *
1739 iser_ib_get_bind(iser_svc_t *iser_svc, ib_guid_t hca_guid, ib_gid_t gid)
1740 {
1741 	iser_sbind_t	*is_sbind;
1742 
1743 	for (is_sbind = list_head(&iser_svc->is_sbindlist);
1744 	    is_sbind != NULL;
1745 	    is_sbind = list_next(&iser_svc->is_sbindlist, is_sbind)) {
1746 
1747 		if ((is_sbind->is_guid == hca_guid) &&
1748 		    (is_sbind->is_gid.gid_prefix == gid.gid_prefix) &&
1749 		    (is_sbind->is_gid.gid_guid == gid.gid_guid)) {
1750 			return (is_sbind);
1751 		}
1752 	}
1753 	return (NULL);
1754 }
1755