xref: /titanic_52/usr/src/uts/common/io/ib/clients/iser/iser_ib.c (revision 2360e12de6667a0a73d68895549343137c26c892)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 #include <sys/types.h>
26 #include <sys/ddi.h>
27 #include <sys/types.h>
28 #include <sys/socket.h>
29 #include <netinet/in.h>
30 #include <sys/sunddi.h>
31 #include <sys/sysmacros.h>
32 #include <sys/iscsi_protocol.h>
33 
34 #include <sys/ib/clients/iser/iser.h>
35 #include <sys/ib/clients/iser/iser_idm.h>
36 
37 /*
38  * iser_ib.c
39  * Routines for InfiniBand transport for iSER
40  *
41  * This file contains the routines to interface with the IBT API to attach and
42  * allocate IB resources, handle async events, and post recv work requests.
43  *
44  */
45 
46 static iser_hca_t *iser_ib_gid2hca(ib_gid_t gid);
47 static iser_hca_t *iser_ib_guid2hca(ib_guid_t guid);
48 
49 static iser_hca_t *iser_ib_alloc_hca(ib_guid_t guid);
50 static int iser_ib_free_hca(iser_hca_t *hca);
51 static int iser_ib_update_hcaports(iser_hca_t *hca);
52 static int iser_ib_init_hcas(void);
53 static int iser_ib_fini_hcas(void);
54 
55 static iser_sbind_t *iser_ib_get_bind(
56     iser_svc_t *iser_svc, ib_guid_t hca_guid, ib_gid_t gid);
57 static int iser_ib_activate_port(
58     idm_svc_t *idm_svc, ib_guid_t guid, ib_gid_t gid);
59 static void iser_ib_deactivate_port(ib_guid_t hca_guid, ib_gid_t gid);
60 
61 static void iser_ib_init_qp(iser_chan_t *chan, uint_t sq_size, uint_t rq_size);
62 static void iser_ib_fini_qp(iser_qp_t *qp);
63 
64 static int iser_ib_setup_cq(ibt_hca_hdl_t hca_hdl, uint_t cq_size,
65     ibt_cq_hdl_t *cq_hdl);
66 
67 static void iser_ib_setup_chanargs(uint8_t hca_port, ibt_cq_hdl_t scq_hdl,
68     ibt_cq_hdl_t rcq_hdl, uint_t sq_size, uint_t rq_size,
69     ibt_pd_hdl_t hca_pdhdl, ibt_rc_chan_alloc_args_t *cargs);
70 
71 static void iser_ib_handle_portup_event(ibt_hca_hdl_t hdl,
72     ibt_async_event_t *event);
73 static void iser_ib_handle_portdown_event(ibt_hca_hdl_t hdl,
74     ibt_async_event_t *event);
75 static void iser_ib_handle_hca_detach_event(ibt_hca_hdl_t hdl,
76     ibt_async_event_t *event);
77 
78 static void iser_ib_post_recv_task(void *arg);
79 
80 static struct ibt_clnt_modinfo_s iser_ib_modinfo = {
81 	IBTI_V_CURR,
82 	IBT_STORAGE_DEV,
83 	iser_ib_async_handler,
84 	NULL,
85 	"iSER"
86 };
87 
88 /*
89  * iser_ib_init
90  *
91  * This function registers the HCA drivers with IBTF and registers and binds
92  * iSER as a service with IBTF.
93  */
94 int
95 iser_ib_init(void)
96 {
97 	int		status;
98 
99 	/* Register with IBTF */
100 	status = ibt_attach(&iser_ib_modinfo, iser_state->is_dip, iser_state,
101 	    &iser_state->is_ibhdl);
102 	if (status != DDI_SUCCESS) {
103 		ISER_LOG(CE_NOTE, "iser_ib_init: ibt_attach failed (0x%x)",
104 		    status);
105 		return (DDI_FAILURE);
106 	}
107 
108 	/* Create the global work request kmem_cache */
109 	iser_state->iser_wr_cache = kmem_cache_create("iser_wr_cache",
110 	    sizeof (iser_wr_t), 0, NULL, NULL, NULL,
111 	    iser_state, NULL, KM_SLEEP);
112 
113 	/* Populate our list of HCAs */
114 	status = iser_ib_init_hcas();
115 	if (status != DDI_SUCCESS) {
116 		/* HCAs failed to initialize, tear it down */
117 		kmem_cache_destroy(iser_state->iser_wr_cache);
118 		(void) ibt_detach(iser_state->is_ibhdl);
119 		iser_state->is_ibhdl = NULL;
120 		ISER_LOG(CE_NOTE, "iser_ib_init: failed to initialize HCAs");
121 		return (DDI_FAILURE);
122 	}
123 
124 	/* Target will register iSER as a service with IBTF when required */
125 
126 	/* Target will bind this service when it comes online */
127 
128 	return (DDI_SUCCESS);
129 }
130 
131 /*
132  * iser_ib_fini
133  *
134  * This function unbinds and degisters the iSER service from IBTF
135  */
136 int
137 iser_ib_fini(void)
138 {
139 	/* IDM would have already disabled all the services */
140 
141 	/* Teardown the HCA list and associated resources */
142 	if (iser_ib_fini_hcas() != DDI_SUCCESS)
143 		return (DDI_FAILURE);
144 
145 	/* Teardown the global work request kmem_cache */
146 	kmem_cache_destroy(iser_state->iser_wr_cache);
147 
148 	/* Deregister with IBTF */
149 	if (iser_state->is_ibhdl != NULL) {
150 		(void) ibt_detach(iser_state->is_ibhdl);
151 		iser_state->is_ibhdl = NULL;
152 	}
153 
154 	return (DDI_SUCCESS);
155 }
156 
157 /*
158  * iser_ib_register_service
159  *
160  * This function registers the iSER service using the RDMA-Aware Service ID.
161  */
162 int
163 iser_ib_register_service(idm_svc_t *idm_svc)
164 {
165 	ibt_srv_desc_t	srvdesc;
166 	iser_svc_t	*iser_svc;
167 	int		status;
168 
169 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
170 
171 	/* Set up IBTI client callback handler from the CM */
172 	srvdesc.sd_handler = iser_ib_cm_handler;
173 
174 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
175 
176 	iser_svc = (iser_svc_t *)idm_svc->is_iser_svc;
177 
178 	/* Register the service on the specified port */
179 	status = ibt_register_service(
180 	    iser_state->is_ibhdl, &srvdesc,
181 	    iser_svc->is_svcid, 1, &iser_svc->is_srvhdl, NULL);
182 
183 	return (status);
184 }
185 
186 /*
187  * iser_ib_bind_service
188  *
189  * This function binds a given iSER service on all available HCA ports. The
190  * current specification does not allow user to specify transport bindings
191  * for each iscsi target. The ULP invokes this function to bind the target
192  * to all available iser ports after checking for the presence of an IB HCA.
193  * iSER is "configured" whenever an IB-capable IP address exists. The lack
194  * of active IB ports is a less-fatal condition, and sockets would be used
195  * as the transport even though an Infiniband HCA is configured but unusable.
196  *
197  */
198 int
199 iser_ib_bind_service(idm_svc_t *idm_svc)
200 {
201 	iser_hca_t	*hca;
202 	ib_gid_t	gid;
203 	int		num_ports = 0;
204 	int		num_binds = 0;
205 	int		num_inactive_binds = 0; /* if HCA ports inactive */
206 	int		status;
207 	int		i;
208 
209 	ASSERT(idm_svc != NULL);
210 	ASSERT(idm_svc->is_iser_svc != NULL);
211 
212 	/* Register the iSER service on all available ports */
213 	mutex_enter(&iser_state->is_hcalist_lock);
214 
215 	for (hca = list_head(&iser_state->is_hcalist);
216 	    hca != NULL;
217 	    hca = list_next(&iser_state->is_hcalist, hca)) {
218 
219 		for (i = 0; i < hca->hca_num_ports; i++) {
220 			num_ports++;
221 			if (hca->hca_port_info[i].p_linkstate !=
222 			    IBT_PORT_ACTIVE) {
223 				/*
224 				 * Move on. We will attempt to bind service
225 				 * in our async handler if the port comes up
226 				 * at a later time.
227 				 */
228 				num_inactive_binds++;
229 				continue;
230 			}
231 
232 			gid = hca->hca_port_info[i].p_sgid_tbl[0];
233 
234 			/* If the port is already bound, skip */
235 			if (iser_ib_get_bind(
236 			    idm_svc->is_iser_svc, hca->hca_guid, gid) == NULL) {
237 
238 				status = iser_ib_activate_port(
239 				    idm_svc, hca->hca_guid, gid);
240 				if (status != IBT_SUCCESS) {
241 					ISER_LOG(CE_NOTE,
242 					    "iser_ib_bind_service: "
243 					    "iser_ib_activate_port failure "
244 					    "(0x%x)", status);
245 					continue;
246 				}
247 			}
248 			num_binds++;
249 		}
250 	}
251 	mutex_exit(&iser_state->is_hcalist_lock);
252 
253 	if (num_binds) {
254 		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Service available on "
255 		    "(%d) of (%d) ports", num_binds, num_ports);
256 		return (ISER_STATUS_SUCCESS);
257 	} else if (num_inactive_binds) {
258 		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Could not bind "
259 		    "service, HCA ports are not active.");
260 		/*
261 		 * still considered success, the async handler will bind
262 		 * the service when the port comes up at a later time
263 		 */
264 		return (ISER_STATUS_SUCCESS);
265 	} else {
266 		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Did not bind service");
267 		return (ISER_STATUS_FAIL);
268 	}
269 }
270 
271 /*
272  * iser_ib_unbind_service
273  *
274  * This function unbinds a given service on a all HCA ports
275  */
276 void
277 iser_ib_unbind_service(idm_svc_t *idm_svc)
278 {
279 	iser_svc_t	*iser_svc;
280 	iser_sbind_t	*is_sbind, *next_sb;
281 
282 	if (idm_svc != NULL && idm_svc->is_iser_svc != NULL) {
283 
284 		iser_svc = idm_svc->is_iser_svc;
285 
286 		for (is_sbind = list_head(&iser_svc->is_sbindlist);
287 		    is_sbind != NULL;
288 		    is_sbind = next_sb) {
289 			next_sb = list_next(&iser_svc->is_sbindlist, is_sbind);
290 			(void) ibt_unbind_service(iser_svc->is_srvhdl,
291 			    is_sbind->is_sbindhdl);
292 			list_remove(&iser_svc->is_sbindlist, is_sbind);
293 			kmem_free(is_sbind, sizeof (iser_sbind_t));
294 		}
295 	}
296 }
297 
298 /* ARGSUSED */
299 void
300 iser_ib_deregister_service(idm_svc_t *idm_svc)
301 {
302 	iser_svc_t	*iser_svc;
303 
304 	if (idm_svc != NULL && idm_svc->is_iser_svc != NULL) {
305 
306 		iser_svc = (iser_svc_t *)idm_svc->is_iser_svc;
307 		(void) ibt_deregister_service(iser_state->is_ibhdl,
308 		    iser_svc->is_srvhdl);
309 		(void) ibt_release_ip_sid(iser_svc->is_svcid);
310 	}
311 }
312 
313 /*
314  * iser_ib_get_paths
315  * This function finds the IB path between the local and the remote address.
316  *
317  */
318 int
319 iser_ib_get_paths(ibt_ip_addr_t *local_ip, ibt_ip_addr_t *remote_ip,
320     ibt_path_info_t *path, ibt_path_ip_src_t *path_src_ip)
321 {
322 	ibt_ip_path_attr_t	ipattr;
323 	int			status;
324 
325 	(void) bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
326 	ipattr.ipa_dst_ip	= remote_ip;
327 	ipattr.ipa_src_ip	= *local_ip;
328 	ipattr.ipa_max_paths	= 1;
329 	ipattr.ipa_ndst		= 1;
330 
331 	(void) bzero(path, sizeof (ibt_path_info_t));
332 	status = ibt_get_ip_paths(iser_state->is_ibhdl, IBT_PATH_NO_FLAGS,
333 	    &ipattr, path, NULL, path_src_ip);
334 	if (status != IBT_SUCCESS) {
335 		ISER_LOG(CE_NOTE, "ibt_get_ip_paths: ibt_get_ip_paths "
336 		    "failure: status (%d)", status);
337 		return (status);
338 	}
339 
340 	if (local_ip != NULL) {
341 		ISER_LOG(CE_NOTE, "iser_ib_get_paths success: IP[%x to %x]",
342 		    local_ip->un.ip4addr, remote_ip->un.ip4addr);
343 	} else {
344 		ISER_LOG(CE_NOTE, "iser_ib_get_paths success: "
345 		    "IP[INADDR_ANY to %x]", remote_ip->un.ip4addr);
346 	}
347 
348 	return (ISER_STATUS_SUCCESS);
349 }
350 
351 /*
352  * iser_ib_alloc_channel_nopathlookup
353  *
354  * This function allocates a reliable connected channel. This function does
355  * not invoke ibt_get_ip_paths() to do the path lookup. The HCA GUID and
356  * port are input to this function.
357  */
358 iser_chan_t *
359 iser_ib_alloc_channel_nopathlookup(ib_guid_t hca_guid, uint8_t hca_port)
360 {
361 	iser_hca_t	*hca;
362 	iser_chan_t	*chan;
363 
364 	/* Lookup the hca using the gid in the path info */
365 	hca = iser_ib_guid2hca(hca_guid);
366 	if (hca == NULL) {
367 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_nopathlookup: failed "
368 		    "to lookup HCA(%llx) handle", (longlong_t)hca_guid);
369 		return (NULL);
370 	}
371 
372 	chan = iser_ib_alloc_rc_channel(hca, hca_port);
373 	if (chan == NULL) {
374 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_nopathlookup: failed "
375 		    "to alloc channel on HCA(%llx) %d",
376 		    (longlong_t)hca_guid, hca_port);
377 		return (NULL);
378 	}
379 
380 	ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup success: "
381 	    "chanhdl (0x%p), HCA(%llx) %d",
382 	    (void *)chan->ic_chanhdl, (longlong_t)hca_guid, hca_port);
383 
384 	return (chan);
385 }
386 
387 /*
388  * iser_ib_alloc_channel_pathlookup
389  *
390  * This function allocates a reliable connected channel but first invokes
391  * ibt_get_ip_paths() with the given local and remote addres to get the
392  * HCA lgid and the port number.
393  */
394 iser_chan_t *
395 iser_ib_alloc_channel_pathlookup(
396     ibt_ip_addr_t *local_ip, ibt_ip_addr_t *remote_ip)
397 {
398 	ibt_path_info_t		ibt_path;
399 	ibt_path_ip_src_t	path_src_ip;
400 	ib_gid_t		lgid;
401 	uint8_t			hca_port; /* from path */
402 	iser_hca_t		*hca;
403 	iser_chan_t		*chan;
404 	int			status;
405 
406 	/* Lookup a path to the given destination */
407 	status = iser_ib_get_paths(
408 	    local_ip, remote_ip, &ibt_path, &path_src_ip);
409 
410 	if (status != ISER_STATUS_SUCCESS) {
411 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup: faild "
412 		    "Path lookup IP:[%llx to %llx] failed: status (%d)",
413 		    (longlong_t)local_ip->un.ip4addr,
414 		    (longlong_t)remote_ip->un.ip4addr,
415 		    status);
416 		return (NULL);
417 	}
418 
419 	/* get the local gid from the path info */
420 	lgid = ibt_path.pi_prim_cep_path.cep_adds_vect.av_sgid;
421 
422 	/* get the hca port from the path info */
423 	hca_port = ibt_path.pi_prim_cep_path.cep_hca_port_num;
424 
425 	/* Lookup the hca using the gid in the path info */
426 	hca = iser_ib_gid2hca(lgid);
427 	if (hca == NULL) {
428 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup: failed "
429 		    "to lookup HCA (%llx) handle",
430 		    (longlong_t)hca->hca_guid);
431 		return (NULL);
432 	}
433 
434 	chan = iser_ib_alloc_rc_channel(hca, hca_port);
435 	if (chan == NULL) {
436 		ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup: failed "
437 		    "to alloc channel from IP:[%llx to %llx] on HCA (%llx) %d",
438 		    (longlong_t)local_ip->un.ip4addr,
439 		    (longlong_t)remote_ip->un.ip4addr,
440 		    (longlong_t)hca->hca_guid, hca_port);
441 		return (NULL);
442 	}
443 
444 	ISER_LOG(CE_NOTE, "iser_ib_alloc_channel_pathlookup success: "
445 	    "chanhdl (0x%p), IP:[%llx to %llx], lgid (%llx:%llx), HCA(%llx) %d",
446 	    (void *)chan->ic_chanhdl,
447 	    (longlong_t)local_ip->un.ip4addr,
448 	    (longlong_t)remote_ip->un.ip4addr,
449 	    (longlong_t)lgid.gid_prefix, (longlong_t)lgid.gid_guid,
450 	    (longlong_t)hca->hca_guid, hca_port);
451 
452 	chan->ic_ibt_path	= ibt_path;
453 	chan->ic_localip	= path_src_ip.ip_primary;
454 	chan->ic_remoteip	= *remote_ip;
455 
456 	return (chan);
457 }
458 
459 /*
460  * iser_ib_alloc_rc_channel
461  *
462  * This function allocates a reliable communication channel using the specified
463  * channel attributes.
464  */
465 iser_chan_t *
466 iser_ib_alloc_rc_channel(iser_hca_t *hca, uint8_t hca_port)
467 {
468 
469 	iser_chan_t			*chan;
470 	ibt_rc_chan_alloc_args_t	chanargs;
471 	uint_t				sq_size, rq_size;
472 	int				status;
473 
474 	chan = kmem_zalloc(sizeof (iser_chan_t), KM_SLEEP);
475 
476 	mutex_init(&chan->ic_chan_lock, NULL, MUTEX_DRIVER, NULL);
477 	mutex_init(&chan->ic_sq_post_lock, NULL, MUTEX_DRIVER, NULL);
478 
479 	/* Set up the iSER channel handle with HCA */
480 	chan->ic_hca		= hca;
481 
482 	/*
483 	 * Determine the queue sizes, based upon the HCA query data.
484 	 * For our Work Queues, we will use either our default value,
485 	 * or the HCA's maximum value, whichever is smaller.
486 	 */
487 	sq_size = min(hca->hca_attr.hca_max_chan_sz, ISER_IB_SENDQ_SIZE);
488 	rq_size = min(hca->hca_attr.hca_max_chan_sz, ISER_IB_RECVQ_SIZE);
489 
490 	/*
491 	 * For our Completion Queues, we again check the device maximum.
492 	 * We want to end up with CQs that are the next size up from the
493 	 * WQs they are servicing so that they have some overhead.
494 	 */
495 	if (hca->hca_attr.hca_max_cq_sz >= (sq_size + 1)) {
496 		chan->ic_sendcq_sz = sq_size + 1;
497 	} else {
498 		chan->ic_sendcq_sz = hca->hca_attr.hca_max_cq_sz;
499 		sq_size = chan->ic_sendcq_sz - 1;
500 	}
501 
502 	if (hca->hca_attr.hca_max_cq_sz >= (rq_size + 1)) {
503 		chan->ic_recvcq_sz = rq_size + 1;
504 	} else {
505 		chan->ic_recvcq_sz = hca->hca_attr.hca_max_cq_sz;
506 		rq_size = chan->ic_recvcq_sz - 1;
507 	}
508 
509 	/* Initialize the iSER channel's QP handle */
510 	iser_ib_init_qp(chan, sq_size, rq_size);
511 
512 	/* Set up the Send Completion Queue */
513 	status = iser_ib_setup_cq(hca->hca_hdl, chan->ic_sendcq_sz,
514 	    &chan->ic_sendcq);
515 	if (status != ISER_STATUS_SUCCESS) {
516 		iser_ib_fini_qp(&chan->ic_qp);
517 		mutex_destroy(&chan->ic_chan_lock);
518 		mutex_destroy(&chan->ic_sq_post_lock);
519 		kmem_free(chan, sizeof (iser_chan_t));
520 		return (NULL);
521 	}
522 	ibt_set_cq_handler(chan->ic_sendcq, iser_ib_sendcq_handler, chan);
523 	(void) ibt_enable_cq_notify(chan->ic_sendcq, IBT_NEXT_COMPLETION);
524 
525 	/* Set up the Receive Completion Queue */
526 	status = iser_ib_setup_cq(hca->hca_hdl, chan->ic_recvcq_sz,
527 	    &chan->ic_recvcq);
528 	if (status != ISER_STATUS_SUCCESS) {
529 		(void) ibt_free_cq(chan->ic_sendcq);
530 		iser_ib_fini_qp(&chan->ic_qp);
531 		mutex_destroy(&chan->ic_chan_lock);
532 		mutex_destroy(&chan->ic_sq_post_lock);
533 		kmem_free(chan, sizeof (iser_chan_t));
534 		return (NULL);
535 	}
536 	ibt_set_cq_handler(chan->ic_recvcq, iser_ib_recvcq_handler, chan);
537 	(void) ibt_enable_cq_notify(chan->ic_recvcq, IBT_NEXT_COMPLETION);
538 
539 	/* Setup the channel arguments */
540 	iser_ib_setup_chanargs(hca_port, chan->ic_sendcq, chan->ic_recvcq,
541 	    sq_size, rq_size, hca->hca_pdhdl, &chanargs);
542 
543 	status = ibt_alloc_rc_channel(hca->hca_hdl,
544 	    IBT_ACHAN_NO_FLAGS, &chanargs, &chan->ic_chanhdl, NULL);
545 	if (status != IBT_SUCCESS) {
546 		ISER_LOG(CE_NOTE, "iser_ib_alloc_rc_channel: failed "
547 		    "ibt_alloc_rc_channel: status (%d)", status);
548 		(void) ibt_free_cq(chan->ic_sendcq);
549 		(void) ibt_free_cq(chan->ic_recvcq);
550 		iser_ib_fini_qp(&chan->ic_qp);
551 		mutex_destroy(&chan->ic_chan_lock);
552 		mutex_destroy(&chan->ic_sq_post_lock);
553 		kmem_free(chan, sizeof (iser_chan_t));
554 		return (NULL);
555 	}
556 
557 	/* Set the 'channel' as the client private data */
558 	(void) ibt_set_chan_private(chan->ic_chanhdl, chan);
559 
560 	return (chan);
561 }
562 
563 /*
564  * iser_ib_open_rc_channel
565  * This function opens a RC connection on the given allocated RC channel
566  */
567 int
568 iser_ib_open_rc_channel(iser_chan_t *chan)
569 {
570 	ibt_ip_cm_info_t	ipcm_info;
571 	iser_private_data_t	iser_priv_data;
572 	ibt_chan_open_args_t	ocargs;
573 	ibt_rc_returns_t	ocreturns;
574 	int			status;
575 
576 	mutex_enter(&chan->ic_chan_lock);
577 
578 	/*
579 	 * For connection establishment, the initiator sends a CM REQ using the
580 	 * iSER RDMA-Aware Service ID. Included are the source and destination
581 	 * IP addresses, and the src port.
582 	 */
583 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
584 	ipcm_info.src_addr = chan->ic_localip;
585 	ipcm_info.dst_addr = chan->ic_remoteip;
586 	ipcm_info.src_port = chan->ic_lport;
587 
588 	/*
589 	 * The CM Private Data field defines the iSER connection parameters
590 	 * such as zero based virtual address exception (ZBVAE) and Send with
591 	 * invalidate Exception (SIE).
592 	 *
593 	 * Solaris IBT does not currently support ZBVAE or SIE.
594 	 */
595 	iser_priv_data.rsvd1	= 0;
596 	iser_priv_data.sie	= 1;
597 	iser_priv_data.zbvae	= 1;
598 
599 	status = ibt_format_ip_private_data(&ipcm_info,
600 	    sizeof (iser_private_data_t), &iser_priv_data);
601 	if (status != IBT_SUCCESS) {
602 		ISER_LOG(CE_NOTE, "iser_ib_open_rc_channel failed: %d", status);
603 		mutex_exit(&chan->ic_chan_lock);
604 		return (status);
605 	}
606 
607 	/*
608 	 * Set the SID we are attempting to connect to, based upon the
609 	 * remote port number.
610 	 */
611 	chan->ic_ibt_path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, chan->ic_rport);
612 
613 	/* Set up the args for the channel open */
614 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
615 	ocargs.oc_path			= &chan->ic_ibt_path;
616 	ocargs.oc_cm_handler		= iser_ib_cm_handler;
617 	ocargs.oc_cm_clnt_private	= iser_state;
618 	ocargs.oc_rdma_ra_out		= 4;
619 	ocargs.oc_rdma_ra_in		= 4;
620 	ocargs.oc_path_retry_cnt	= 2;
621 	ocargs.oc_path_rnr_retry_cnt	= 2;
622 	ocargs.oc_priv_data_len		= sizeof (iser_private_data_t);
623 	ocargs.oc_priv_data		= &iser_priv_data;
624 
625 	bzero(&ocreturns, sizeof (ibt_rc_returns_t));
626 
627 	status = ibt_open_rc_channel(chan->ic_chanhdl,
628 	    IBT_OCHAN_NO_FLAGS, IBT_BLOCKING, &ocargs, &ocreturns);
629 
630 	if (status != IBT_SUCCESS) {
631 		ISER_LOG(CE_NOTE, "iser_ib_open_rc_channel failed: %d", status);
632 		mutex_exit(&chan->ic_chan_lock);
633 		return (status);
634 	}
635 
636 	mutex_exit(&chan->ic_chan_lock);
637 	return (IDM_STATUS_SUCCESS);
638 }
639 
640 /*
641  * iser_ib_close_rc_channel
642  * This function closes the RC channel related to this iser_chan handle.
643  * We invoke this in a non-blocking, no callbacks context.
644  */
645 void
646 iser_ib_close_rc_channel(iser_chan_t *chan)
647 {
648 	int			status;
649 
650 	mutex_enter(&chan->ic_chan_lock);
651 	status = ibt_close_rc_channel(chan->ic_chanhdl, IBT_BLOCKING, NULL,
652 	    0, NULL, NULL, 0);
653 	if (status != IBT_SUCCESS) {
654 		ISER_LOG(CE_NOTE, "iser_ib_close_rc_channel: "
655 		    "ibt_close_rc_channel failed: status (%d)", status);
656 	}
657 	mutex_exit(&chan->ic_chan_lock);
658 }
659 
660 /*
661  * iser_ib_free_rc_channel
662  *
663  * This function tears down an RC channel's QP initialization and frees it.
664  * Note that we do not need synchronization here; the channel has been
665  * closed already, so we should only have completion polling occuring.  Once
666  * complete, we are free to free the IBTF channel, WQ and CQ resources, and
667  * our own related resources.
668  */
669 void
670 iser_ib_free_rc_channel(iser_chan_t *chan)
671 {
672 	iser_qp_t	*iser_qp;
673 
674 	iser_qp = &chan->ic_qp;
675 
676 	/* Ensure the SQ is empty */
677 	while (chan->ic_sq_post_count != 0) {
678 		mutex_exit(&chan->ic_conn->ic_lock);
679 		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
680 		mutex_enter(&chan->ic_conn->ic_lock);
681 	}
682 	mutex_destroy(&chan->ic_sq_post_lock);
683 
684 	/* Ensure the RQ is empty */
685 	(void) ibt_flush_channel(chan->ic_chanhdl);
686 	mutex_enter(&iser_qp->qp_lock);
687 	while (iser_qp->rq_level != 0) {
688 		mutex_exit(&iser_qp->qp_lock);
689 		mutex_exit(&chan->ic_conn->ic_lock);
690 		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
691 		mutex_enter(&chan->ic_conn->ic_lock);
692 		mutex_enter(&iser_qp->qp_lock);
693 	}
694 
695 	/* Free our QP handle */
696 	mutex_exit(&iser_qp->qp_lock);
697 	(void) iser_ib_fini_qp(iser_qp);
698 
699 	/* Free the IBT channel resources */
700 	(void) ibt_free_channel(chan->ic_chanhdl);
701 	chan->ic_chanhdl = NULL;
702 
703 	/* Free the CQs */
704 	(void) ibt_free_cq(chan->ic_sendcq);
705 	(void) ibt_free_cq(chan->ic_recvcq);
706 
707 	/* Free the chan handle */
708 	mutex_destroy(&chan->ic_chan_lock);
709 	kmem_free(chan, sizeof (iser_chan_t));
710 }
711 
712 /*
713  * iser_ib_post_recv
714  *
715  * This function handles keeping the RQ full on a given channel.
716  * This routine will mostly be run on a taskq, and will check the
717  * current fill level of the RQ, and post as many WRs as necessary
718  * to fill it again.
719  */
720 
721 int
722 iser_ib_post_recv_async(ibt_channel_hdl_t chanhdl)
723 {
724 	iser_chan_t	*chan;
725 	int		status;
726 
727 	/* Pull our iSER channel handle from the private data */
728 	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);
729 
730 	/*
731 	 * Caller must check that chan->ic_conn->ic_stage indicates
732 	 * the connection is active (not closing, not closed) and
733 	 * it must hold the mutex cross the check and the call to this function
734 	 */
735 	ASSERT(mutex_owned(&chan->ic_conn->ic_lock));
736 	ASSERT((chan->ic_conn->ic_stage >= ISER_CONN_STAGE_ALLOCATED) &&
737 	    (chan->ic_conn->ic_stage <= ISER_CONN_STAGE_LOGGED_IN));
738 	idm_conn_hold(chan->ic_conn->ic_idmc);
739 	status = ddi_taskq_dispatch(iser_taskq, iser_ib_post_recv_task,
740 	    (void *)chanhdl, DDI_NOSLEEP);
741 	if (status != DDI_SUCCESS) {
742 		idm_conn_rele(chan->ic_conn->ic_idmc);
743 	}
744 
745 	return (status);
746 }
747 
748 static void
749 iser_ib_post_recv_task(void *arg)
750 {
751 	ibt_channel_hdl_t	chanhdl = arg;
752 	iser_chan_t		*chan;
753 
754 	/* Pull our iSER channel handle from the private data */
755 	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);
756 
757 	iser_ib_post_recv(chanhdl);
758 	idm_conn_rele(chan->ic_conn->ic_idmc);
759 }
760 
761 void
762 iser_ib_post_recv(ibt_channel_hdl_t chanhdl)
763 {
764 	iser_chan_t	*chan;
765 	iser_hca_t	*hca;
766 	iser_msg_t	*msg;
767 	ibt_recv_wr_t	*wrlist, wr[ISER_IB_RQ_POST_MAX];
768 	int		rq_space, msg_ret;
769 	int		total_num, npost;
770 	uint_t		nposted;
771 	int		status, i;
772 	iser_qp_t	*iser_qp;
773 
774 	/* Pull our iSER channel handle from the private data */
775 	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);
776 
777 	ASSERT(chan != NULL);
778 
779 	mutex_enter(&chan->ic_conn->ic_lock);
780 
781 	/* Bail out if the connection is closed; no need for more recv WRs */
782 	if ((chan->ic_conn->ic_stage == ISER_CONN_STAGE_CLOSING) ||
783 	    (chan->ic_conn->ic_stage == ISER_CONN_STAGE_CLOSED)) {
784 		mutex_exit(&chan->ic_conn->ic_lock);
785 		return;
786 	}
787 
788 	/* get the QP handle from the iser_chan */
789 	iser_qp = &chan->ic_qp;
790 
791 	hca = chan->ic_hca;
792 
793 	if (hca == NULL) {
794 		ISER_LOG(CE_NOTE, "iser_ib_post_recv: unable to retrieve "
795 		    "HCA handle");
796 		mutex_exit(&chan->ic_conn->ic_lock);
797 		return;
798 	}
799 
800 	/* check for space to post on the RQ */
801 	mutex_enter(&iser_qp->qp_lock);
802 	rq_space = iser_qp->rq_depth - iser_qp->rq_level;
803 	if (rq_space == 0) {
804 		/* The RQ is full, clear the pending flag and return */
805 		iser_qp->rq_taskqpending = B_FALSE;
806 		mutex_exit(&iser_qp->qp_lock);
807 		mutex_exit(&chan->ic_conn->ic_lock);
808 		return;
809 	}
810 
811 	/* Keep track of the lowest value for rq_min_post_level */
812 	if (iser_qp->rq_level < iser_qp->rq_min_post_level)
813 		iser_qp->rq_min_post_level = iser_qp->rq_level;
814 
815 	mutex_exit(&iser_qp->qp_lock);
816 
817 	/* we've room to post, so pull from the msg cache */
818 	msg = iser_msg_get(hca, rq_space, &msg_ret);
819 	if (msg == NULL) {
820 		ISER_LOG(CE_NOTE, "iser_ib_post_recv: no message handles "
821 		    "available in msg cache currently");
822 		/*
823 		 * There are no messages on the cache. Wait a half-
824 		 * second, then try again.
825 		 */
826 		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
827 		status = iser_ib_post_recv_async(chanhdl);
828 		if (status != DDI_SUCCESS) {
829 			ISER_LOG(CE_NOTE, "iser_ib_post_recv: failed to "
830 			    "redispatch routine");
831 			/* Failed to dispatch, clear pending flag */
832 			mutex_enter(&iser_qp->qp_lock);
833 			iser_qp->rq_taskqpending = B_FALSE;
834 			mutex_exit(&iser_qp->qp_lock);
835 		}
836 		mutex_exit(&chan->ic_conn->ic_lock);
837 		return;
838 	}
839 
840 	if (msg_ret != rq_space) {
841 		ISER_LOG(CE_NOTE, "iser_ib_post_recv: requested number of "
842 		    "messages not allocated: requested (%d) allocated (%d)",
843 		    rq_space, msg_ret);
844 		/* We got some, but not all, of our requested depth */
845 		rq_space = msg_ret;
846 	}
847 
848 	/*
849 	 * Now, walk through the allocated WRs and post them,
850 	 * ISER_IB_RQ_POST_MAX (or less) at a time.
851 	 */
852 	wrlist = &wr[0];
853 	total_num = rq_space;
854 
855 	while (total_num) {
856 		/* determine the number to post on this iteration */
857 		npost = (total_num > ISER_IB_RQ_POST_MAX) ?
858 		    ISER_IB_RQ_POST_MAX : total_num;
859 
860 		/* build a list of WRs from the msg list */
861 		for (i = 0; i < npost; i++) {
862 			wrlist[i].wr_id		= (ibt_wrid_t)(uintptr_t)msg;
863 			wrlist[i].wr_nds	= ISER_IB_SGLIST_SIZE;
864 			wrlist[i].wr_sgl	= &msg->msg_ds;
865 			msg = msg->nextp;
866 		}
867 
868 		/* post the list to the RQ */
869 		nposted = 0;
870 		status = ibt_post_recv(chanhdl, wrlist, npost, &nposted);
871 		if ((status != IBT_SUCCESS) || (nposted != npost)) {
872 			ISER_LOG(CE_NOTE, "iser_ib_post_recv: ibt_post_recv "
873 			    "failed: requested (%d) posted (%d) status (%d)",
874 			    npost, nposted, status);
875 			total_num -= nposted;
876 			break;
877 		}
878 
879 		/* decrement total number to post by the number posted */
880 		total_num -= nposted;
881 	}
882 
883 	mutex_enter(&iser_qp->qp_lock);
884 	if (total_num != 0) {
885 		ISER_LOG(CE_NOTE, "iser_ib_post_recv: unable to fill RQ, "
886 		    "failed to post (%d) WRs", total_num);
887 		iser_qp->rq_level += rq_space - total_num;
888 	} else {
889 		iser_qp->rq_level += rq_space;
890 	}
891 
892 	/*
893 	 * Now that we've filled the RQ, check that all of the recv WRs
894 	 * haven't just been immediately consumed. If so, taskqpending is
895 	 * still B_TRUE, so we need to fire off a taskq thread to post
896 	 * more WRs.
897 	 */
898 	if (iser_qp->rq_level == 0) {
899 		mutex_exit(&iser_qp->qp_lock);
900 		status = iser_ib_post_recv_async(chanhdl);
901 		if (status != DDI_SUCCESS) {
902 			ISER_LOG(CE_NOTE, "iser_ib_post_recv: failed to "
903 			    "dispatch followup routine");
904 			/* Failed to dispatch, clear pending flag */
905 			mutex_enter(&iser_qp->qp_lock);
906 			iser_qp->rq_taskqpending = B_FALSE;
907 			mutex_exit(&iser_qp->qp_lock);
908 		}
909 	} else {
910 		/*
911 		 * We're done, we've filled the RQ. Clear the taskq
912 		 * flag so that we can run again.
913 		 */
914 		iser_qp->rq_taskqpending = B_FALSE;
915 		mutex_exit(&iser_qp->qp_lock);
916 	}
917 
918 	mutex_exit(&chan->ic_conn->ic_lock);
919 }
920 
921 /*
922  * iser_ib_handle_portup_event()
923  * This handles the IBT_EVENT_PORT_UP unaffiliated asynchronous event.
924  *
925  * To facilitate a seamless bringover of the port and configure the CM service
926  * for inbound iSER service requests on this newly active port, the existing
927  * IDM services will be checked for iSER support.
928  * If an iSER service was already created, then this service will simply be
929  * bound to the gid of the newly active port. If on the other hand, the CM
930  * service did not exist, i.e. only socket communication, then a new CM
931  * service will be first registered with the saved service parameters and
932  * then bound to the newly active port.
933  *
934  */
935 /* ARGSUSED */
936 static void
937 iser_ib_handle_portup_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
938 {
939 	iser_hca_t		*hca;
940 	ib_gid_t		gid;
941 	idm_svc_t		*idm_svc;
942 	int			status;
943 
944 	ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event: HCA(0x%llx) port(%d)",
945 	    (longlong_t)event->ev_hca_guid, event->ev_port);
946 
947 	/*
948 	 * Query all ports on the HCA and update the port information
949 	 * maintainted in the iser_hca_t structure
950 	 */
951 	hca = iser_ib_guid2hca(event->ev_hca_guid);
952 	if (hca == NULL) {
953 
954 		/* HCA is just made available, first port on that HCA */
955 		hca = iser_ib_alloc_hca(event->ev_hca_guid);
956 		if (hca == NULL) {
957 			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
958 			    "iser_ib_alloc_hca failed: HCA(0x%llx) port(%d)",
959 			    (longlong_t)event->ev_hca_guid, event->ev_port);
960 			return;
961 		}
962 		mutex_enter(&iser_state->is_hcalist_lock);
963 		list_insert_tail(&iser_state->is_hcalist, hca);
964 		iser_state->is_num_hcas++;
965 		mutex_exit(&iser_state->is_hcalist_lock);
966 
967 	} else {
968 
969 		status = iser_ib_update_hcaports(hca);
970 
971 		if (status != IBT_SUCCESS) {
972 			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
973 			    "status(0x%x): iser_ib_update_hcaports failed: "
974 			    "HCA(0x%llx) port(%d)", status,
975 			    (longlong_t)event->ev_hca_guid, event->ev_port);
976 			return;
977 		}
978 	}
979 
980 	gid = hca->hca_port_info[event->ev_port - 1].p_sgid_tbl[0];
981 
982 	/*
983 	 * Iterate through the global list of IDM target services
984 	 * and check for existing iSER CM service.
985 	 */
986 	mutex_enter(&idm.idm_global_mutex);
987 	for (idm_svc = list_head(&idm.idm_tgt_svc_list);
988 	    idm_svc != NULL;
989 	    idm_svc = list_next(&idm.idm_tgt_svc_list, idm_svc)) {
990 
991 
992 		if (idm_svc->is_iser_svc == NULL) {
993 
994 			/* Establish a new CM service for iSER requests */
995 			status = iser_tgt_svc_create(
996 			    &idm_svc->is_svc_req, idm_svc);
997 
998 			if (status != IBT_SUCCESS) {
999 				ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
1000 				    "status(0x%x): iser_tgt_svc_create failed: "
1001 				    "HCA(0x%llx) port(%d)", status,
1002 				    (longlong_t)event->ev_hca_guid,
1003 				    event->ev_port);
1004 
1005 				continue;
1006 			}
1007 		}
1008 
1009 		status = iser_ib_activate_port(
1010 		    idm_svc, event->ev_hca_guid, gid);
1011 		if (status != IBT_SUCCESS) {
1012 
1013 			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
1014 			    "status(0x%x): Bind service on port "
1015 			    "(%llx:%llx) failed",
1016 			    status, (longlong_t)gid.gid_prefix,
1017 			    (longlong_t)gid.gid_guid);
1018 
1019 			continue;
1020 		}
1021 		ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event: service bound "
1022 		    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
1023 		    event->ev_port);
1024 	}
1025 	mutex_exit(&idm.idm_global_mutex);
1026 
1027 	ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event success: "
1028 	    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
1029 	    event->ev_port);
1030 }
1031 
1032 /*
1033  * iser_ib_handle_portdown_event()
1034  * This handles the IBT_EVENT_PORT_DOWN unaffiliated asynchronous error.
1035  *
1036  * Unconfigure the CM service on the deactivated port and teardown the
1037  * connections that are using the CM service.
1038  */
1039 /* ARGSUSED */
1040 static void
1041 iser_ib_handle_portdown_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
1042 {
1043 	iser_hca_t		*hca;
1044 	ib_gid_t		gid;
1045 	int			status;
1046 
1047 	/*
1048 	 * Query all ports on the HCA and update the port information
1049 	 * maintainted in the iser_hca_t structure
1050 	 */
1051 	hca = iser_ib_guid2hca(event->ev_hca_guid);
1052 	ASSERT(hca != NULL);
1053 
1054 	status = iser_ib_update_hcaports(hca);
1055 	if (status != IBT_SUCCESS) {
1056 		ISER_LOG(CE_NOTE, "iser_ib_handle_portdown_event status(0x%x): "
1057 		    "ibt_ib_update_hcaports failed: HCA(0x%llx) port(%d)",
1058 		    status, (longlong_t)event->ev_hca_guid, event->ev_port);
1059 		return;
1060 	}
1061 
1062 	/* get the gid of the new port */
1063 	gid = hca->hca_port_info[event->ev_port - 1].p_sgid_tbl[0];
1064 	iser_ib_deactivate_port(event->ev_hca_guid, gid);
1065 
1066 	ISER_LOG(CE_NOTE, "iser_ib_handle_portdown_event success: "
1067 	    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
1068 	    event->ev_port);
1069 }
1070 
1071 /*
1072  * iser_ib_handle_hca_detach_event()
1073  * Quiesce all activity bound for the port, teardown the connection, unbind
1074  * iSER services on all ports and release the HCA handle.
1075  */
1076 /* ARGSUSED */
1077 static void
1078 iser_ib_handle_hca_detach_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
1079 {
1080 	iser_hca_t	*nexthca, *hca;
1081 	int		i, status;
1082 
1083 	ISER_LOG(CE_NOTE, "iser_ib_handle_hca_detach_event: HCA(0x%llx)",
1084 	    (longlong_t)event->ev_hca_guid);
1085 
1086 	hca = iser_ib_guid2hca(event->ev_hca_guid);
1087 	for (i = 0; i < hca->hca_num_ports; i++) {
1088 		iser_ib_deactivate_port(hca->hca_guid,
1089 		    hca->hca_port_info[i].p_sgid_tbl[0]);
1090 	}
1091 
1092 	/*
1093 	 * Update the HCA list maintained in the iser_state. Free the
1094 	 * resources allocated to the HCA, i.e. caches, protection domain
1095 	 */
1096 	mutex_enter(&iser_state->is_hcalist_lock);
1097 
1098 	for (hca = list_head(&iser_state->is_hcalist);
1099 	    hca != NULL;
1100 	    hca = nexthca) {
1101 
1102 		nexthca = list_next(&iser_state->is_hcalist, hca);
1103 
1104 		if (hca->hca_guid == event->ev_hca_guid) {
1105 
1106 			list_remove(&iser_state->is_hcalist, hca);
1107 			iser_state->is_num_hcas--;
1108 
1109 			status = iser_ib_free_hca(hca);
1110 			if (status != DDI_SUCCESS) {
1111 				ISER_LOG(CE_WARN, "iser_ib_handle_hca_detach: "
1112 				    "Failed to free hca(%p)", (void *)hca);
1113 				list_insert_tail(&iser_state->is_hcalist, hca);
1114 				iser_state->is_num_hcas++;
1115 			}
1116 			/* No way to return status to IBT if this fails */
1117 		}
1118 	}
1119 	mutex_exit(&iser_state->is_hcalist_lock);
1120 
1121 }
1122 
1123 /*
1124  * iser_ib_async_handler
1125  * An IBT Asynchronous Event handler is registered it with the framework and
1126  * passed via the ibt_attach() routine. This function handles the following
1127  * asynchronous events.
1128  * IBT_EVENT_PORT_UP
1129  * IBT_ERROR_PORT_DOWN
1130  * IBT_HCA_ATTACH_EVENT
1131  * IBT_HCA_DETACH_EVENT
1132  */
1133 /* ARGSUSED */
1134 void
1135 iser_ib_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
1136     ibt_async_event_t *event)
1137 {
1138 	switch (code) {
1139 	case IBT_EVENT_PORT_UP:
1140 		iser_ib_handle_portup_event(hdl, event);
1141 		break;
1142 
1143 	case IBT_ERROR_PORT_DOWN:
1144 		iser_ib_handle_portdown_event(hdl, event);
1145 		break;
1146 
1147 	case IBT_HCA_ATTACH_EVENT:
1148 		/*
1149 		 * A new HCA device is available for use, ignore this
1150 		 * event because the corresponding IBT_EVENT_PORT_UP
1151 		 * events will get triggered and handled accordingly.
1152 		 */
1153 		break;
1154 
1155 	case IBT_HCA_DETACH_EVENT:
1156 		iser_ib_handle_hca_detach_event(hdl, event);
1157 		break;
1158 
1159 	default:
1160 		break;
1161 	}
1162 }
1163 
1164 /*
1165  * iser_ib_init_hcas
1166  *
1167  * This function opens all the HCA devices, gathers the HCA state information
1168  * and adds the HCA handle for each HCA found in the iser_soft_state.
1169  */
1170 static int
1171 iser_ib_init_hcas(void)
1172 {
1173 	ib_guid_t	*guid;
1174 	int		num_hcas;
1175 	int		i;
1176 	iser_hca_t	*hca;
1177 
1178 	/* Retrieve the HCA list */
1179 	num_hcas = ibt_get_hca_list(&guid);
1180 	if (num_hcas == 0) {
1181 		/*
1182 		 * This shouldn't happen, but might if we have all HCAs
1183 		 * detach prior to initialization.
1184 		 */
1185 		return (DDI_FAILURE);
1186 	}
1187 
1188 	/* Initialize the hcalist lock */
1189 	mutex_init(&iser_state->is_hcalist_lock, NULL, MUTEX_DRIVER, NULL);
1190 
1191 	/* Create the HCA list */
1192 	list_create(&iser_state->is_hcalist, sizeof (iser_hca_t),
1193 	    offsetof(iser_hca_t, hca_node));
1194 
1195 	for (i = 0; i < num_hcas; i++) {
1196 
1197 		ISER_LOG(CE_NOTE, "iser_ib_init_hcas: initializing HCA "
1198 		    "(0x%llx)", (longlong_t)guid[i]);
1199 
1200 		hca = iser_ib_alloc_hca(guid[i]);
1201 		if (hca == NULL) {
1202 			/* This shouldn't happen, teardown and fail */
1203 			(void) iser_ib_fini_hcas();
1204 			(void) ibt_free_hca_list(guid, num_hcas);
1205 			return (DDI_FAILURE);
1206 		}
1207 
1208 		mutex_enter(&iser_state->is_hcalist_lock);
1209 		list_insert_tail(&iser_state->is_hcalist, hca);
1210 		iser_state->is_num_hcas++;
1211 		mutex_exit(&iser_state->is_hcalist_lock);
1212 
1213 	}
1214 
1215 	/* Free the IBT HCA list */
1216 	(void) ibt_free_hca_list(guid, num_hcas);
1217 
1218 	/* Check that we've initialized at least one HCA */
1219 	mutex_enter(&iser_state->is_hcalist_lock);
1220 	if (list_is_empty(&iser_state->is_hcalist)) {
1221 		ISER_LOG(CE_NOTE, "iser_ib_init_hcas: failed to initialize "
1222 		    "any HCAs");
1223 
1224 		mutex_exit(&iser_state->is_hcalist_lock);
1225 		(void) iser_ib_fini_hcas();
1226 		return (DDI_FAILURE);
1227 	}
1228 	mutex_exit(&iser_state->is_hcalist_lock);
1229 
1230 	return (DDI_SUCCESS);
1231 }
1232 
1233 /*
1234  * iser_ib_fini_hcas
1235  *
1236  * Teardown the iSER HCA list initialized above.
1237  */
1238 static int
1239 iser_ib_fini_hcas(void)
1240 {
1241 	iser_hca_t	*nexthca, *hca;
1242 	int		status;
1243 
1244 	mutex_enter(&iser_state->is_hcalist_lock);
1245 	for (hca = list_head(&iser_state->is_hcalist);
1246 	    hca != NULL;
1247 	    hca = nexthca) {
1248 
1249 		nexthca = list_next(&iser_state->is_hcalist, hca);
1250 
1251 		list_remove(&iser_state->is_hcalist, hca);
1252 
1253 		status = iser_ib_free_hca(hca);
1254 		if (status != IBT_SUCCESS) {
1255 			ISER_LOG(CE_NOTE, "iser_ib_fini_hcas: failed to free "
1256 			    "HCA during fini");
1257 			list_insert_tail(&iser_state->is_hcalist, hca);
1258 			return (DDI_FAILURE);
1259 		}
1260 
1261 		iser_state->is_num_hcas--;
1262 
1263 	}
1264 	mutex_exit(&iser_state->is_hcalist_lock);
1265 	list_destroy(&iser_state->is_hcalist);
1266 	mutex_destroy(&iser_state->is_hcalist_lock);
1267 
1268 	return (DDI_SUCCESS);
1269 }
1270 
1271 /*
1272  * iser_ib_alloc_hca
1273  *
1274  * This function opens the given HCA device, gathers the HCA state information
1275  * and adds the HCA handle
1276  */
1277 static iser_hca_t *
1278 iser_ib_alloc_hca(ib_guid_t guid)
1279 {
1280 	iser_hca_t	*hca;
1281 	int		status;
1282 
1283 	/* Allocate an iser_hca_t HCA handle */
1284 	hca = (iser_hca_t *)kmem_zalloc(sizeof (iser_hca_t), KM_SLEEP);
1285 
1286 	/* Open this HCA */
1287 	status = ibt_open_hca(iser_state->is_ibhdl, guid, &hca->hca_hdl);
1288 	if (status != IBT_SUCCESS) {
1289 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_open_hca failed:"
1290 		    " guid (0x%llx) status (0x%x)", (longlong_t)guid, status);
1291 		kmem_free(hca, sizeof (iser_hca_t));
1292 		return (NULL);
1293 	}
1294 
1295 	hca->hca_guid		= guid;
1296 	hca->hca_clnt_hdl	= iser_state->is_ibhdl;
1297 
1298 	/* Query the HCA */
1299 	status = ibt_query_hca(hca->hca_hdl, &hca->hca_attr);
1300 	if (status != IBT_SUCCESS) {
1301 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_query_hca "
1302 		    "failure: guid (0x%llx) status (0x%x)",
1303 		    (longlong_t)guid, status);
1304 		(void) ibt_close_hca(hca->hca_hdl);
1305 		kmem_free(hca, sizeof (iser_hca_t));
1306 		return (NULL);
1307 	}
1308 
1309 	/* Query all ports on the HCA */
1310 	status = ibt_query_hca_ports(hca->hca_hdl, 0,
1311 	    &hca->hca_port_info, &hca->hca_num_ports,
1312 	    &hca->hca_port_info_sz);
1313 	if (status != IBT_SUCCESS) {
1314 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: "
1315 		    "ibt_query_hca_ports failure: guid (0x%llx) "
1316 		    "status (0x%x)", (longlong_t)guid, status);
1317 		(void) ibt_close_hca(hca->hca_hdl);
1318 		kmem_free(hca, sizeof (iser_hca_t));
1319 		return (NULL);
1320 	}
1321 
1322 	/* Allocate a single PD on this HCA */
1323 	status = ibt_alloc_pd(hca->hca_hdl, IBT_PD_NO_FLAGS,
1324 	    &hca->hca_pdhdl);
1325 	if (status != IBT_SUCCESS) {
1326 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_alloc_pd "
1327 		    "failure: guid (0x%llx) status (0x%x)",
1328 		    (longlong_t)guid, status);
1329 		(void) ibt_close_hca(hca->hca_hdl);
1330 		ibt_free_portinfo(hca->hca_port_info, hca->hca_port_info_sz);
1331 		kmem_free(hca, sizeof (iser_hca_t));
1332 		return (NULL);
1333 	}
1334 
1335 	/* Initialize the message and data MR caches for this HCA */
1336 	iser_init_hca_caches(hca);
1337 
1338 	return (hca);
1339 }
1340 
1341 static int
1342 iser_ib_free_hca(iser_hca_t *hca)
1343 {
1344 	int			status;
1345 	ibt_hca_portinfo_t	*hca_port_info;
1346 	uint_t			hca_port_info_sz;
1347 
1348 	ASSERT(hca != NULL);
1349 	if (hca->hca_failed)
1350 		return (DDI_FAILURE);
1351 
1352 	hca_port_info = hca->hca_port_info;
1353 	hca_port_info_sz = hca->hca_port_info_sz;
1354 
1355 	/*
1356 	 * Free the memory regions before freeing
1357 	 * the associated protection domain
1358 	 */
1359 	iser_fini_hca_caches(hca);
1360 
1361 	status = ibt_free_pd(hca->hca_hdl, hca->hca_pdhdl);
1362 	if (status != IBT_SUCCESS) {
1363 		ISER_LOG(CE_NOTE, "iser_ib_free_hca: failed to free PD "
1364 		    "status=0x%x", status);
1365 		goto out_caches;
1366 	}
1367 
1368 	status = ibt_close_hca(hca->hca_hdl);
1369 	if (status != IBT_SUCCESS) {
1370 		ISER_LOG(CE_NOTE, "iser_ib_fini_hcas: failed to close HCA "
1371 		    "status=0x%x", status);
1372 		goto out_pd;
1373 	}
1374 
1375 	ibt_free_portinfo(hca_port_info, hca_port_info_sz);
1376 
1377 	kmem_free(hca, sizeof (iser_hca_t));
1378 	return (DDI_SUCCESS);
1379 
1380 	/*
1381 	 * We only managed to partially tear down the HCA, try to put it back
1382 	 * like it was before returning.
1383 	 */
1384 out_pd:
1385 	status = ibt_alloc_pd(hca->hca_hdl, IBT_PD_NO_FLAGS, &hca->hca_pdhdl);
1386 	if (status != IBT_SUCCESS) {
1387 		hca->hca_failed = B_TRUE;
1388 		/* Report error and exit */
1389 		ISER_LOG(CE_NOTE, "iser_ib_free_hca: could not re-alloc PD "
1390 		    "status=0x%x", status);
1391 		return (DDI_FAILURE);
1392 	}
1393 
1394 out_caches:
1395 	iser_init_hca_caches(hca);
1396 
1397 	return (DDI_FAILURE);
1398 }
1399 
1400 static int
1401 iser_ib_update_hcaports(iser_hca_t *hca)
1402 {
1403 	ibt_hca_portinfo_t	*pinfop, *oldpinfop;
1404 	uint_t			size, oldsize, nport;
1405 	int			status;
1406 
1407 	ASSERT(hca != NULL);
1408 
1409 	status = ibt_query_hca_ports(hca->hca_hdl, 0, &pinfop, &nport, &size);
1410 	if (status != IBT_SUCCESS) {
1411 		ISER_LOG(CE_NOTE, "ibt_query_hca_ports failed: %d", status);
1412 		return (status);
1413 	}
1414 
1415 	oldpinfop = hca->hca_port_info;
1416 	oldsize	= hca->hca_port_info_sz;
1417 	hca->hca_port_info = pinfop;
1418 	hca->hca_port_info_sz = size;
1419 
1420 	(void) ibt_free_portinfo(oldpinfop, oldsize);
1421 
1422 	return (IBT_SUCCESS);
1423 }
1424 
1425 /*
1426  * iser_ib_gid2hca
1427  * Given a gid, find the corresponding hca
1428  */
1429 iser_hca_t *
1430 iser_ib_gid2hca(ib_gid_t gid)
1431 {
1432 
1433 	iser_hca_t	*hca;
1434 	int		i;
1435 
1436 	mutex_enter(&iser_state->is_hcalist_lock);
1437 	for (hca = list_head(&iser_state->is_hcalist);
1438 	    hca != NULL;
1439 	    hca = list_next(&iser_state->is_hcalist, hca)) {
1440 
1441 		for (i = 0; i < hca->hca_num_ports; i++) {
1442 			if ((hca->hca_port_info[i].p_sgid_tbl[0].gid_prefix ==
1443 			    gid.gid_prefix) &&
1444 			    (hca->hca_port_info[i].p_sgid_tbl[0].gid_guid ==
1445 			    gid.gid_guid)) {
1446 
1447 				mutex_exit(&iser_state->is_hcalist_lock);
1448 
1449 				return (hca);
1450 			}
1451 		}
1452 	}
1453 	mutex_exit(&iser_state->is_hcalist_lock);
1454 	return (NULL);
1455 }
1456 
1457 /*
1458  * iser_ib_guid2hca
1459  * Given a HCA guid, find the corresponding HCA
1460  */
1461 iser_hca_t *
1462 iser_ib_guid2hca(ib_guid_t guid)
1463 {
1464 
1465 	iser_hca_t	*hca;
1466 
1467 	mutex_enter(&iser_state->is_hcalist_lock);
1468 	for (hca = list_head(&iser_state->is_hcalist);
1469 	    hca != NULL;
1470 	    hca = list_next(&iser_state->is_hcalist, hca)) {
1471 
1472 		if (hca->hca_guid == guid) {
1473 			mutex_exit(&iser_state->is_hcalist_lock);
1474 			return (hca);
1475 		}
1476 	}
1477 	mutex_exit(&iser_state->is_hcalist_lock);
1478 	return (NULL);
1479 }
1480 
1481 /*
1482  * iser_ib_conv_sockaddr2ibtaddr
1483  * This function converts a socket address into the IBT format
1484  */
1485 void iser_ib_conv_sockaddr2ibtaddr(
1486     idm_sockaddr_t *saddr, ibt_ip_addr_t *ibt_addr)
1487 {
1488 	if (saddr == NULL) {
1489 		ibt_addr->family = AF_UNSPEC;
1490 		ibt_addr->un.ip4addr = 0;
1491 	} else {
1492 		switch (saddr->sin.sa_family) {
1493 		case AF_INET:
1494 
1495 			ibt_addr->family	= saddr->sin4.sin_family;
1496 			ibt_addr->un.ip4addr	= saddr->sin4.sin_addr.s_addr;
1497 			break;
1498 
1499 		case AF_INET6:
1500 
1501 			ibt_addr->family	= saddr->sin6.sin6_family;
1502 			ibt_addr->un.ip6addr	= saddr->sin6.sin6_addr;
1503 			break;
1504 
1505 		default:
1506 			ibt_addr->family = AF_UNSPEC;
1507 		}
1508 
1509 	}
1510 }
1511 
1512 /*
1513  * iser_ib_conv_ibtaddr2sockaddr
1514  * This function converts an IBT ip address handle to a sockaddr
1515  */
1516 void iser_ib_conv_ibtaddr2sockaddr(struct sockaddr_storage *ss,
1517     ibt_ip_addr_t *ibt_addr, in_port_t port)
1518 {
1519 	struct sockaddr_in *sin;
1520 	struct sockaddr_in6 *sin6;
1521 
1522 	switch (ibt_addr->family) {
1523 	case AF_INET:
1524 	case AF_UNSPEC:
1525 
1526 		sin = (struct sockaddr_in *)ibt_addr;
1527 		sin->sin_port = ntohs(port);
1528 		bcopy(sin, ss, sizeof (struct sockaddr_in));
1529 		break;
1530 
1531 	case AF_INET6:
1532 
1533 		sin6 = (struct sockaddr_in6 *)ibt_addr;
1534 		sin6->sin6_port = ntohs(port);
1535 		bcopy(sin6, ss, sizeof (struct sockaddr_in6));
1536 		break;
1537 
1538 	default:
1539 		ISER_LOG(CE_NOTE, "iser_ib_conv_ibtaddr2sockaddr: "
1540 		    "unknown family type: 0x%x", ibt_addr->family);
1541 	}
1542 }
1543 
1544 /*
1545  * iser_ib_setup_cq
1546  * This function sets up the Completion Queue size and allocates the specified
1547  * Completion Queue
1548  */
1549 static int
1550 iser_ib_setup_cq(ibt_hca_hdl_t hca_hdl, uint_t cq_size, ibt_cq_hdl_t *cq_hdl)
1551 {
1552 
1553 	ibt_cq_attr_t		cq_attr;
1554 	int			status;
1555 
1556 	cq_attr.cq_size		= cq_size;
1557 	cq_attr.cq_sched	= 0;
1558 	cq_attr.cq_flags	= IBT_CQ_NO_FLAGS;
1559 
1560 	/* Allocate a Completion Queue */
1561 	status = ibt_alloc_cq(hca_hdl, &cq_attr, cq_hdl, NULL);
1562 	if (status != IBT_SUCCESS) {
1563 		ISER_LOG(CE_NOTE, "iser_ib_setup_cq: ibt_alloc_cq failure (%d)",
1564 		    status);
1565 		return (status);
1566 	}
1567 
1568 	return (ISER_STATUS_SUCCESS);
1569 }
1570 
1571 /*
1572  * iser_ib_setup_chanargs
1573  *
1574  */
1575 static void
1576 iser_ib_setup_chanargs(uint8_t hca_port, ibt_cq_hdl_t scq_hdl,
1577     ibt_cq_hdl_t rcq_hdl, uint_t sq_size, uint_t rq_size,
1578     ibt_pd_hdl_t hca_pdhdl, ibt_rc_chan_alloc_args_t *cargs)
1579 {
1580 
1581 	bzero(cargs, sizeof (ibt_rc_chan_alloc_args_t));
1582 
1583 	/*
1584 	 * Set up the size of the channels send queue, receive queue and the
1585 	 * maximum number of elements in a scatter gather list of work requests
1586 	 * posted to the send and receive queues.
1587 	 */
1588 	cargs->rc_sizes.cs_sq		= sq_size;
1589 	cargs->rc_sizes.cs_rq		= rq_size;
1590 	cargs->rc_sizes.cs_sq_sgl	= ISER_IB_SGLIST_SIZE;
1591 	cargs->rc_sizes.cs_rq_sgl	= ISER_IB_SGLIST_SIZE;
1592 
1593 	/*
1594 	 * All Work requests signaled on a WR basis will receive a send
1595 	 * request completion.
1596 	 */
1597 	cargs->rc_flags			= IBT_ALL_SIGNALED;
1598 
1599 	/* Enable RDMA read and RDMA write on the channel end points */
1600 	cargs->rc_control		= IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1601 
1602 	/* Set the local hca port on which the channel is allocated */
1603 	cargs->rc_hca_port_num		= hca_port;
1604 
1605 	/* Set the Send and Receive Completion Queue handles */
1606 	cargs->rc_scq			= scq_hdl;
1607 	cargs->rc_rcq			= rcq_hdl;
1608 
1609 	/* Set the protection domain associated with the channel */
1610 	cargs->rc_pd			= hca_pdhdl;
1611 
1612 	/* No SRQ usage */
1613 	cargs->rc_srq			= NULL;
1614 }
1615 
1616 /*
1617  * iser_ib_init_qp
1618  * Initialize the QP handle
1619  */
1620 void
1621 iser_ib_init_qp(iser_chan_t *chan, uint_t sq_size, uint_t rq_size)
1622 {
1623 	/* Initialize the handle lock */
1624 	mutex_init(&chan->ic_qp.qp_lock, NULL, MUTEX_DRIVER, NULL);
1625 
1626 	/* Record queue sizes */
1627 	chan->ic_qp.sq_size = sq_size;
1628 	chan->ic_qp.rq_size = rq_size;
1629 
1630 	/* Initialize the RQ monitoring data */
1631 	chan->ic_qp.rq_depth  = rq_size;
1632 	chan->ic_qp.rq_level  = 0;
1633 	chan->ic_qp.rq_lwm = (chan->ic_recvcq_sz * ISER_IB_RQ_LWM_PCT) / 100;
1634 
1635 	/* Initialize the taskq flag */
1636 	chan->ic_qp.rq_taskqpending = B_FALSE;
1637 }
1638 
1639 /*
1640  * iser_ib_fini_qp
1641  * Teardown the QP handle
1642  */
1643 void
1644 iser_ib_fini_qp(iser_qp_t *qp)
1645 {
1646 	/* Destroy the handle lock */
1647 	mutex_destroy(&qp->qp_lock);
1648 }
1649 
1650 static int
1651 iser_ib_activate_port(idm_svc_t *idm_svc, ib_guid_t guid, ib_gid_t gid)
1652 {
1653 	iser_svc_t	*iser_svc;
1654 	iser_sbind_t	*is_sbind;
1655 	int		status;
1656 
1657 	iser_svc = idm_svc->is_iser_svc;
1658 
1659 	/*
1660 	 * Save the address of the service bind handle in the
1661 	 * iser_svc_t to undo the service binding at a later time
1662 	 */
1663 	is_sbind = kmem_zalloc(sizeof (iser_sbind_t), KM_SLEEP);
1664 	is_sbind->is_gid	= gid;
1665 	is_sbind->is_guid	= guid;
1666 
1667 	status  = ibt_bind_service(iser_svc->is_srvhdl, gid, NULL,
1668 	    idm_svc, &is_sbind->is_sbindhdl);
1669 
1670 	if (status != IBT_SUCCESS) {
1671 		ISER_LOG(CE_NOTE, "iser_ib_activate_port: status(0x%x): "
1672 		    "Bind service(%llx) on port(%llx:%llx) failed",
1673 		    status, (longlong_t)iser_svc->is_svcid,
1674 		    (longlong_t)gid.gid_prefix, (longlong_t)gid.gid_guid);
1675 
1676 		kmem_free(is_sbind, sizeof (iser_sbind_t));
1677 
1678 		return (status);
1679 	}
1680 
1681 	list_insert_tail(&iser_svc->is_sbindlist, is_sbind);
1682 
1683 	return (IBT_SUCCESS);
1684 }
1685 
1686 static void
1687 iser_ib_deactivate_port(ib_guid_t hca_guid, ib_gid_t gid)
1688 {
1689 	iser_svc_t	*iser_svc;
1690 	iser_conn_t	*iser_conn;
1691 	iser_sbind_t	*is_sbind;
1692 	idm_conn_t	*idm_conn;
1693 
1694 	/*
1695 	 * Iterate through the global list of IDM target connections.
1696 	 * Issue a TRANSPORT_FAIL for any connections on this port, and
1697 	 * if there is a bound service running on the port, tear it down.
1698 	 */
1699 	mutex_enter(&idm.idm_global_mutex);
1700 	for (idm_conn = list_head(&idm.idm_tgt_conn_list);
1701 	    idm_conn != NULL;
1702 	    idm_conn = list_next(&idm.idm_tgt_conn_list, idm_conn)) {
1703 
1704 		if (idm_conn->ic_transport_type != IDM_TRANSPORT_TYPE_ISER) {
1705 			/* this is not an iSER connection, skip it */
1706 			continue;
1707 		}
1708 
1709 		iser_conn = idm_conn->ic_transport_private;
1710 		if (iser_conn->ic_chan->ic_ibt_path.pi_hca_guid != hca_guid) {
1711 			/* this iSER connection is on a different port */
1712 			continue;
1713 		}
1714 
1715 		/* Fail the transport for this connection */
1716 		idm_conn_event(idm_conn, CE_TRANSPORT_FAIL, IDM_STATUS_FAIL);
1717 
1718 		if (idm_conn->ic_conn_type == CONN_TYPE_INI) {
1719 			/* initiator connection, nothing else to do */
1720 			continue;
1721 		}
1722 
1723 		/* Check for a service binding */
1724 		iser_svc = idm_conn->ic_svc_binding->is_iser_svc;
1725 		is_sbind = iser_ib_get_bind(iser_svc, hca_guid, gid);
1726 		if (is_sbind != NULL) {
1727 			/* This service is still bound, tear it down */
1728 			(void) ibt_unbind_service(iser_svc->is_srvhdl,
1729 			    is_sbind->is_sbindhdl);
1730 			list_remove(&iser_svc->is_sbindlist, is_sbind);
1731 			kmem_free(is_sbind, sizeof (iser_sbind_t));
1732 		}
1733 	}
1734 	mutex_exit(&idm.idm_global_mutex);
1735 }
1736 
1737 static iser_sbind_t *
1738 iser_ib_get_bind(iser_svc_t *iser_svc, ib_guid_t hca_guid, ib_gid_t gid)
1739 {
1740 	iser_sbind_t	*is_sbind;
1741 
1742 	for (is_sbind = list_head(&iser_svc->is_sbindlist);
1743 	    is_sbind != NULL;
1744 	    is_sbind = list_next(&iser_svc->is_sbindlist, is_sbind)) {
1745 
1746 		if ((is_sbind->is_guid == hca_guid) &&
1747 		    (is_sbind->is_gid.gid_prefix == gid.gid_prefix) &&
1748 		    (is_sbind->is_gid.gid_guid == gid.gid_guid)) {
1749 			return (is_sbind);
1750 		}
1751 	}
1752 	return (NULL);
1753 }
1754