xref: /titanic_51/usr/src/uts/common/rpc/rpcib.c (revision 8eea8e29cc4374d1ee24c25a07f45af132db3499)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * The rpcib plugin. Implements the interface for RDMATF's
31  * interaction with IBTF.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/user.h>
37 #include <sys/systm.h>
38 #include <sys/sysmacros.h>
39 #include <sys/proc.h>
40 #include <sys/socket.h>
41 #include <sys/file.h>
42 #include <sys/stream.h>
43 #include <sys/strsubr.h>
44 #include <sys/stropts.h>
45 #include <sys/errno.h>
46 #include <sys/kmem.h>
47 #include <sys/debug.h>
48 #include <sys/systm.h>
49 #include <sys/pathname.h>
50 #include <sys/kstat.h>
51 #include <sys/t_lock.h>
52 #include <sys/ddi.h>
53 #include <sys/cmn_err.h>
54 #include <sys/time.h>
55 #include <sys/isa_defs.h>
56 #include <sys/callb.h>
57 #include <sys/sunddi.h>
58 #include <sys/sunndi.h>
59 
60 #include <sys/ib/ibtl/ibti.h>
61 #include <rpc/rpc.h>
62 #include <rpc/ib.h>
63 
64 #include <sys/modctl.h>
65 
66 #include <sys/pathname.h>
67 #include <sys/kstr.h>
68 #include <sys/sockio.h>
69 #include <sys/vnode.h>
70 #include <sys/tiuser.h>
71 #include <net/if.h>
72 #include <sys/cred.h>
73 
74 
75 extern char *inet_ntop(int, const void *, char *, int);
76 
77 
78 /*
79  * Prototype declarations for driver ops
80  */
81 
82 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
83 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
84 			    void *, void **);
85 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
86 
87 
88 /* rpcib cb_ops */
89 static struct cb_ops rpcib_cbops = {
90 	nulldev,		/* open */
91 	nulldev,		/* close */
92 	nodev,			/* strategy */
93 	nodev,			/* print */
94 	nodev,			/* dump */
95 	nodev,			/* read */
96 	nodev,			/* write */
97 	nodev,			/* ioctl */
98 	nodev,			/* devmap */
99 	nodev,			/* mmap */
100 	nodev,			/* segmap */
101 	nochpoll,		/* poll */
102 	ddi_prop_op,		/* prop_op */
103 	NULL,			/* stream */
104 	D_MP,			/* cb_flag */
105 	CB_REV,			/* rev */
106 	nodev,			/* int (*cb_aread)() */
107 	nodev			/* int (*cb_awrite)() */
108 };
109 
110 /*
111  * Device options
112  */
113 static struct dev_ops rpcib_ops = {
114 	DEVO_REV,		/* devo_rev, */
115 	0,			/* refcnt  */
116 	rpcib_getinfo,		/* info */
117 	nulldev,		/* identify */
118 	nulldev,		/* probe */
119 	rpcib_attach,		/* attach */
120 	rpcib_detach,		/* detach */
121 	nodev,			/* reset */
122 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
123 	NULL,			/* bus operations */
124 	NULL			/* power */
125 };
126 
127 /*
128  * Module linkage information.
129  */
130 
131 static struct modldrv rib_modldrv = {
132 	&mod_driverops,			    /* Driver module */
133 	"RPCIB plugin driver, ver %I%", /* Driver name and version */
134 	&rpcib_ops,		    /* Driver ops */
135 };
136 
137 static struct modlinkage rib_modlinkage = {
138 	MODREV_1,
139 	(void *)&rib_modldrv,
140 	NULL
141 };
142 
143 /*
144  * rib_stat: private data pointer used when registering
145  *	with the IBTF.  It is returned to the consumer
146  *	in all callbacks.
147  */
148 static rpcib_state_t *rib_stat = NULL;
149 
/*
 * NOTE(review): none of these are referenced in the part of the file
 * reviewed here; the descriptions below are inferred from the names
 * and should be confirmed against their users.
 */
#define	RNR_RETRIES	2	/* presumably receiver-not-ready retry count */
#define	MAX_PORTS	2	/* presumably ports handled per HCA */

/* Global (non-static) tunables. */
int	preposted_rbufs = 16;
int	send_threshold = 1;
155 
156 /*
157  * State of the plugin.
158  * ACCEPT = accepting new connections and requests.
159  * NO_ACCEPT = not accepting new connection and requests.
160  * This should eventually move to rpcib_state_t structure, since this
161  * will tell in which state the plugin is for a particular type of service
162  * like NFS, NLM or v4 Callback deamon. The plugin might be in accept
163  * state for one and in no_accept state for the other.
164  */
165 int		plugin_state;
166 kmutex_t	plugin_state_lock;
167 
168 
169 /*
170  * RPCIB RDMATF operations
171  */
172 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
173 static rdma_stat rib_disconnect(CONN *conn);
174 static void rib_listen(struct rdma_svc_data *rd);
175 static void rib_listen_stop(struct rdma_svc_data *rd);
176 static rdma_stat rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
177 	struct mrc *buf_handle);
178 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
179 	struct mrc buf_handle);
180 static rdma_stat rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
181 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
182 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
183 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
184 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
185 	caddr_t buf, int len, int cpu);
186 
187 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
188 
189 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
190 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
191 
192 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
193 
194 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
195 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
196 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
197 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
198 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
199 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
200 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
201 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
202 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
203 static rdma_stat rib_conn_release(CONN *conn);
204 static rdma_stat rib_getinfo(rdma_info_t *info);
205 static rdma_stat rib_register_ats(rib_hca_t *);
206 static void rib_deregister_ats();
207 static void rib_stop_services(rib_hca_t *);
208 
209 /*
210  * RPCIB addressing operations
211  */
212 char ** get_ip_addrs(int *count);
213 int get_interfaces(TIUSER *tiptr, int *num);
214 int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
215 int get_ibd_ipaddr(rpcib_ibd_insts_t *);
216 rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
217 void rib_get_ibd_insts(rpcib_ibd_insts_t *);
218 
219 
220 /*
221  * RDMA operations the RPCIB module exports
222  */
223 static rdmaops_t rib_ops = {
224 	rib_reachable,
225 	rib_conn_get,
226 	rib_conn_release,
227 	rib_listen,
228 	rib_listen_stop,
229 	rib_registermem,
230 	rib_deregistermem,
231 	rib_registermemsync,
232 	rib_deregistermemsync,
233 	rib_syncmem,
234 	rib_reg_buf_alloc,
235 	rib_reg_buf_free,
236 	rib_send,
237 	rib_send_resp,
238 	rib_post_resp,
239 	rib_post_recv,
240 	rib_recv,
241 	rib_read,
242 	rib_write,
243 	rib_getinfo
244 };
245 
246 /*
247  * RDMATF RPCIB plugin details
248  */
249 static rdma_mod_t rib_mod = {
250 	"ibtf",		/* api name */
251 	RDMATF_VERS_1,
252 	0,
253 	&rib_ops,	/* rdma op vector for ibtf */
254 };
255 
256 static rdma_stat open_hcas(rpcib_state_t *);
257 static rdma_stat rib_qp_init(rib_qp_t *, int);
258 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
259 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
260 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
261 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
262 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
263 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
264 	ibt_mr_hdl_t *, ibt_mr_desc_t *);
265 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
266 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
267 	rib_qp_t **);
268 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
269 	rib_qp_t **);
270 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
271 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
272 static int rib_free_sendwait(struct send_wid *);
273 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
274 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
275 static void rdma_done_rem_list(rib_qp_t *);
276 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
277 
278 static void rib_async_handler(void *,
279 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
280 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
281 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
282 static int rib_free_svc_recv(struct svc_recv *);
283 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
284 static void rib_free_wid(struct recv_wid *);
285 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
286 static void rib_detach_hca(rib_hca_t *);
287 static rdma_stat rib_chk_srv_ats(rib_hca_t *, struct netbuf *, int,
288 	ibt_path_info_t *);
289 
290 /*
291  * Registration with IBTF as a consumer
292  */
293 static struct ibt_clnt_modinfo_s rib_modinfo = {
294 	IBTI_V1,
295 	IBT_GENERIC,
296 	rib_async_handler,	/* async event handler */
297 	NULL,			/* Memory Region Handler */
298 	"nfs/ib"
299 };
300 
301 /*
302  * Global strucuture
303  */
304 
305 typedef struct rpcib_s {
306 	dev_info_t	*rpcib_dip;
307 	kmutex_t	rpcib_mutex;
308 } rpcib_t;
309 
310 rpcib_t rpcib;
311 
/*
 * /etc/system controlled variable to control
 * debugging in the rpcib kernel module.
 * Set it to values greater than 1 to control
 * the amount of debugging messages required.
 */
int rib_debug = 0;

/*
 * Cleared (under rib_stat->open_hca_lock) once an ATS service has been
 * deregistered successfully in rib_deregister_ats().
 */
static int ats_running = 0;
321 int
322 _init(void)
323 {
324 	int		error;
325 
326 	error = mod_install((struct modlinkage *)&rib_modlinkage);
327 	if (error != 0) {
328 		/*
329 		 * Could not load module
330 		 */
331 		return (error);
332 	}
333 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
334 
335 	return (0);
336 }
337 
338 int
339 _fini()
340 {
341 	int status;
342 
343 	if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) {
344 		return (EBUSY);
345 	}
346 
347 	rib_deregister_ats();
348 
349 	/*
350 	 * Remove module
351 	 */
352 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
353 		(void) rdma_register_mod(&rib_mod);
354 		return (status);
355 	}
356 	mutex_destroy(&plugin_state_lock);
357 	return (0);
358 }
359 
360 int
361 _info(struct modinfo *modinfop)
362 {
363 	return (mod_info(&rib_modlinkage, modinfop));
364 }
365 
366 
367 /*
368  * rpcib_getinfo()
369  * Given the device number, return the devinfo pointer or the
370  * instance number.
371  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
372  */
373 
374 /*ARGSUSED*/
375 static int
376 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
377 {
378 	int ret = DDI_SUCCESS;
379 
380 	switch (cmd) {
381 	case DDI_INFO_DEVT2DEVINFO:
382 		if (rpcib.rpcib_dip != NULL)
383 			*result = rpcib.rpcib_dip;
384 		else {
385 			*result = NULL;
386 			ret = DDI_FAILURE;
387 		}
388 		break;
389 
390 	case DDI_INFO_DEVT2INSTANCE:
391 		*result = NULL;
392 		break;
393 
394 	default:
395 		ret = DDI_FAILURE;
396 	}
397 	return (ret);
398 }
399 
400 static int
401 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
402 {
403 	ibt_status_t	ibt_status;
404 	rdma_stat	r_status;
405 
406 	switch (cmd) {
407 	case DDI_ATTACH:
408 		break;
409 	case DDI_RESUME:
410 		return (DDI_SUCCESS);
411 	default:
412 		return (DDI_FAILURE);
413 	}
414 
415 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
416 
417 	mutex_enter(&rpcib.rpcib_mutex);
418 	if (rpcib.rpcib_dip != NULL) {
419 		mutex_exit(&rpcib.rpcib_mutex);
420 		return (DDI_FAILURE);
421 	}
422 	rpcib.rpcib_dip = dip;
423 	mutex_exit(&rpcib.rpcib_mutex);
424 	/*
425 	 * Create the "rpcib" minor-node.
426 	 */
427 	if (ddi_create_minor_node(dip,
428 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
429 		/* Error message, no cmn_err as they print on console */
430 		return (DDI_FAILURE);
431 	}
432 
433 	if (rib_stat == NULL) {
434 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
435 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
436 	}
437 
438 	rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
439 	if (rib_stat->hca_count < 1) {
440 		mutex_destroy(&rib_stat->open_hca_lock);
441 		kmem_free(rib_stat, sizeof (*rib_stat));
442 		rib_stat = NULL;
443 		return (DDI_FAILURE);
444 	}
445 
446 	ibt_status = ibt_attach(&rib_modinfo, dip,
447 			(void *)rib_stat, &rib_stat->ibt_clnt_hdl);
448 	if (ibt_status != IBT_SUCCESS) {
449 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
450 		mutex_destroy(&rib_stat->open_hca_lock);
451 		kmem_free(rib_stat, sizeof (*rib_stat));
452 		rib_stat = NULL;
453 		return (DDI_FAILURE);
454 	}
455 
456 	mutex_enter(&rib_stat->open_hca_lock);
457 	if (open_hcas(rib_stat) != RDMA_SUCCESS) {
458 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
459 		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
460 		mutex_exit(&rib_stat->open_hca_lock);
461 		mutex_destroy(&rib_stat->open_hca_lock);
462 		kmem_free(rib_stat, sizeof (*rib_stat));
463 		rib_stat = NULL;
464 		return (DDI_FAILURE);
465 	}
466 	mutex_exit(&rib_stat->open_hca_lock);
467 
468 	/*
469 	 * Register with rdmatf
470 	 */
471 	rib_mod.rdma_count = rib_stat->hca_count;
472 	r_status = rdma_register_mod(&rib_mod);
473 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
474 		rib_detach_hca(rib_stat->hca);
475 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
476 		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
477 		mutex_destroy(&rib_stat->open_hca_lock);
478 		kmem_free(rib_stat, sizeof (*rib_stat));
479 		rib_stat = NULL;
480 		return (DDI_FAILURE);
481 	}
482 
483 
484 	return (DDI_SUCCESS);
485 }
486 
487 /*ARGSUSED*/
488 static int
489 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
490 {
491 	switch (cmd) {
492 
493 	case DDI_DETACH:
494 		break;
495 
496 	case DDI_SUSPEND:
497 	default:
498 		return (DDI_FAILURE);
499 	}
500 
501 	/*
502 	 * Detach the hca and free resources
503 	 */
504 	mutex_enter(&plugin_state_lock);
505 	plugin_state = NO_ACCEPT;
506 	mutex_exit(&plugin_state_lock);
507 	rib_detach_hca(rib_stat->hca);
508 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
509 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
510 
511 	mutex_enter(&rpcib.rpcib_mutex);
512 	rpcib.rpcib_dip = NULL;
513 	mutex_exit(&rpcib.rpcib_mutex);
514 
515 	mutex_destroy(&rpcib.rpcib_mutex);
516 	return (DDI_SUCCESS);
517 }
518 
519 
520 static void
521 rib_deregister_ats()
522 {
523 	rib_hca_t		*hca;
524 	rib_service_t		*srv_list, *to_remove;
525 	ibt_status_t   		ibt_status;
526 
527 	/*
528 	 * deregister the Address Translation Service.
529 	 */
530 	hca = rib_stat->hca;
531 	rw_enter(&hca->service_list_lock, RW_WRITER);
532 	srv_list = hca->ats_list;
533 	while (srv_list != NULL) {
534 		to_remove = srv_list;
535 		srv_list = to_remove->srv_next;
536 
537 		ibt_status = ibt_deregister_ar(hca->ibt_clnt_hdl,
538 				&to_remove->srv_ar);
539 		if (ibt_status != IBT_SUCCESS) {
540 #ifdef DEBUG
541 		    if (rib_debug) {
542 			cmn_err(CE_WARN, "_fini: "
543 			    "ibt_deregister_ar FAILED"
544 				" status: %d", ibt_status);
545 		    }
546 #endif
547 		} else {
548 		    mutex_enter(&rib_stat->open_hca_lock);
549 		    ats_running = 0;
550 		    mutex_exit(&rib_stat->open_hca_lock);
551 #ifdef DEBUG
552 		    if (rib_debug) {
553 
554 			cmn_err(CE_NOTE, "_fini: "
555 			    "Successfully unregistered"
556 			    " ATS service: %s",
557 			    to_remove->srv_name);
558 		    }
559 #endif
560 		}
561 		kmem_free(to_remove, sizeof (rib_service_t));
562 	}
563 	hca->ats_list = NULL;
564 	rw_exit(&hca->service_list_lock);
565 }
566 
567 static void rib_rbufpool_free(rib_hca_t *, int);
568 static void rib_rbufpool_deregister(rib_hca_t *, int);
569 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
570 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
571 static rdma_stat rib_rem_replylist(rib_qp_t *);
572 static int rib_remreply(rib_qp_t *, struct reply *);
573 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
574 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
575 
576 /*
577  * One CQ pair per HCA
578  */
579 static rdma_stat
580 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
581 	rib_cq_t **cqp, rpcib_state_t *ribstat)
582 {
583 	rib_cq_t	*cq;
584 	ibt_cq_attr_t	cq_attr;
585 	uint32_t	real_size;
586 	ibt_status_t	status;
587 	rdma_stat	error = RDMA_SUCCESS;
588 
589 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
590 	cq->rib_hca = hca;
591 	cq_attr.cq_size = cq_size;
592 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
593 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
594 	    &real_size);
595 	if (status != IBT_SUCCESS) {
596 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
597 				" status=%d", status);
598 		error = RDMA_FAILED;
599 		goto fail;
600 	}
601 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
602 
603 	/*
604 	 * Enable CQ callbacks. CQ Callbacks are single shot
605 	 * (e.g. you have to call ibt_enable_cq_notify()
606 	 * after each callback to get another one).
607 	 */
608 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
609 	if (status != IBT_SUCCESS) {
610 		cmn_err(CE_WARN, "rib_create_cq: "
611 			"enable_cq_notify failed, status %d", status);
612 		error = RDMA_FAILED;
613 		goto fail;
614 	}
615 	*cqp = cq;
616 
617 	return (error);
618 fail:
619 	if (cq->rib_cq_hdl)
620 		(void) ibt_free_cq(cq->rib_cq_hdl);
621 	if (cq)
622 		kmem_free(cq, sizeof (rib_cq_t));
623 	return (error);
624 }
625 
626 static rdma_stat
627 open_hcas(rpcib_state_t *ribstat)
628 {
629 	rib_hca_t		*hca;
630 	ibt_status_t		ibt_status;
631 	rdma_stat		status;
632 	ibt_hca_portinfo_t	*pinfop;
633 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
634 	uint_t			size, cq_size;
635 	int			i;
636 
637 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
638 	if (ribstat->hcas == NULL)
639 		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
640 				    sizeof (rib_hca_t), KM_SLEEP);
641 
642 	/*
643 	 * Open a hca and setup for RDMA
644 	 */
645 	for (i = 0; i < ribstat->hca_count; i++) {
646 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
647 				ribstat->hca_guids[i],
648 				&ribstat->hcas[i].hca_hdl);
649 		if (ibt_status != IBT_SUCCESS) {
650 			cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) "
651 				"returned %d", i, ibt_status);
652 			continue;
653 		}
654 		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
655 		hca = &(ribstat->hcas[i]);
656 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
657 		hca->state = HCA_INITED;
658 
659 		/*
660 		 * query HCA info
661 		 */
662 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
663 		if (ibt_status != IBT_SUCCESS) {
664 			cmn_err(CE_WARN, "open_hcas: ibt_query_hca "
665 			    "returned %d (hca_guid 0x%llx)",
666 			    ibt_status, (longlong_t)ribstat->hca_guids[i]);
667 			goto fail1;
668 		}
669 
670 		/*
671 		 * One PD (Protection Domain) per HCA.
672 		 * A qp is allowed to access a memory region
673 		 * only when it's in the same PD as that of
674 		 * the memory region.
675 		 */
676 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
677 		if (ibt_status != IBT_SUCCESS) {
678 			cmn_err(CE_WARN, "open_hcas: ibt_alloc_pd "
679 				"returned %d (hca_guid 0x%llx)",
680 				ibt_status, (longlong_t)ribstat->hca_guids[i]);
681 			goto fail1;
682 		}
683 
684 		/*
685 		 * query HCA ports
686 		 */
687 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
688 				0, &pinfop, &hca->hca_nports, &size);
689 		if (ibt_status != IBT_SUCCESS) {
690 			cmn_err(CE_WARN, "open_hcas: "
691 				"ibt_query_hca_ports returned %d "
692 				"(hca_guid 0x%llx)",
693 				ibt_status, (longlong_t)hca->hca_guid);
694 			goto fail2;
695 		}
696 		hca->hca_ports = pinfop;
697 		hca->hca_pinfosz = size;
698 		pinfop = NULL;
699 
700 		cq_size = DEF_CQ_SIZE; /* default cq size */
701 		/*
702 		 * Create 2 pairs of cq's (1 pair for client
703 		 * and the other pair for server) on this hca.
704 		 * If number of qp's gets too large, then several
705 		 * cq's will be needed.
706 		 */
707 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
708 				&hca->svc_rcq, ribstat);
709 		if (status != RDMA_SUCCESS) {
710 			goto fail3;
711 		}
712 
713 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
714 				&hca->svc_scq, ribstat);
715 		if (status != RDMA_SUCCESS) {
716 			goto fail3;
717 		}
718 
719 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
720 				&hca->clnt_rcq, ribstat);
721 		if (status != RDMA_SUCCESS) {
722 			goto fail3;
723 		}
724 
725 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
726 				&hca->clnt_scq, ribstat);
727 		if (status != RDMA_SUCCESS) {
728 			goto fail3;
729 		}
730 
731 		/*
732 		 * Create buffer pools.
733 		 * Note rib_rbuf_create also allocates memory windows.
734 		 */
735 		hca->recv_pool = rib_rbufpool_create(hca,
736 					RECV_BUFFER, MAX_BUFS);
737 		if (hca->recv_pool == NULL) {
738 			cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
739 			goto fail3;
740 		}
741 
742 		hca->send_pool = rib_rbufpool_create(hca,
743 					SEND_BUFFER, MAX_BUFS);
744 		if (hca->send_pool == NULL) {
745 			cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
746 			rib_rbufpool_destroy(hca, RECV_BUFFER);
747 			goto fail3;
748 		}
749 
750 		/*
751 		 * Initialize the registered service list and
752 		 * the lock
753 		 */
754 		hca->service_list = NULL;
755 		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
756 
757 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
758 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
759 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
760 			hca->iblock);
761 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
762 			hca->iblock);
763 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
764 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
765 		hca->inuse = TRUE;
766 		/*
767 		 * XXX One hca only. Add multi-hca functionality if needed
768 		 * later.
769 		 */
770 		ribstat->hca = hca;
771 		ribstat->nhca_inited++;
772 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
773 		break;
774 
775 fail3:
776 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
777 fail2:
778 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
779 fail1:
780 		(void) ibt_close_hca(hca->hca_hdl);
781 
782 	}
783 	if (ribstat->hca != NULL)
784 		return (RDMA_SUCCESS);
785 	else
786 		return (RDMA_FAILED);
787 }
788 
789 /*
790  * Callback routines
791  */
792 
793 /*
794  * SCQ handlers
795  */
796 /* ARGSUSED */
797 static void
798 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
799 {
800 	ibt_status_t	ibt_status;
801 	ibt_wc_t	wc;
802 	int		i;
803 
804 	/*
805 	 * Re-enable cq notify here to avoid missing any
806 	 * completion queue notification.
807 	 */
808 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
809 
810 	ibt_status = IBT_SUCCESS;
811 	while (ibt_status != IBT_CQ_EMPTY) {
812 	    bzero(&wc, sizeof (wc));
813 	    ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
814 	    if (ibt_status != IBT_SUCCESS)
815 		return;
816 
817 	/*
818 	 * Got a send completion
819 	 */
820 	    if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
821 		struct send_wid *wd = (struct send_wid *)wc.wc_id;
822 		CONN	*conn = qptoc(wd->qp);
823 
824 		mutex_enter(&wd->sendwait_lock);
825 		switch (wc.wc_status) {
826 		case IBT_WC_SUCCESS:
827 			wd->status = RDMA_SUCCESS;
828 			break;
829 		case IBT_WC_WR_FLUSHED_ERR:
830 			wd->status = RDMA_FAILED;
831 			break;
832 		default:
833 /*
834  *    RC Send Q Error Code		Local state     Remote State
835  *    ==================== 		===========     ============
836  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
837  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
838  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
839  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
840  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
841  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
842  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
843  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
844  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
845  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
846  *    IBT_WC_WR_FLUSHED_ERR               None            None
847  */
848 #ifdef DEBUG
849 	if (rib_debug > 1) {
850 	    if (wc.wc_status != IBT_WC_SUCCESS) {
851 		    cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
852 			"WR completed in error, wc.wc_status:%d, "
853 			"wc_id:%llx\n", wc.wc_status, (longlong_t)wc.wc_id);
854 	    }
855 	}
856 #endif
857 			/*
858 			 * Channel in error state. Set connection to
859 			 * ERROR and cleanup will happen either from
860 			 * conn_release  or from rib_conn_get
861 			 */
862 			wd->status = RDMA_FAILED;
863 			mutex_enter(&conn->c_lock);
864 			if (conn->c_state != C_DISCONN_PEND)
865 				conn->c_state = C_ERROR;
866 			mutex_exit(&conn->c_lock);
867 			break;
868 		}
869 		if (wd->cv_sig == 1) {
870 			/*
871 			 * Notify poster
872 			 */
873 			cv_signal(&wd->wait_cv);
874 			mutex_exit(&wd->sendwait_lock);
875 		} else {
876 			/*
877 			 * Poster not waiting for notification.
878 			 * Free the send buffers and send_wid
879 			 */
880 			for (i = 0; i < wd->nsbufs; i++) {
881 				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
882 					(void *)wd->sbufaddr[i]);
883 			}
884 			mutex_exit(&wd->sendwait_lock);
885 			(void) rib_free_sendwait(wd);
886 		}
887 	    }
888 	}
889 }
890 
891 /* ARGSUSED */
892 static void
893 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
894 {
895 	ibt_status_t	ibt_status;
896 	ibt_wc_t	wc;
897 	int		i;
898 
899 	/*
900 	 * Re-enable cq notify here to avoid missing any
901 	 * completion queue notification.
902 	 */
903 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
904 
905 	ibt_status = IBT_SUCCESS;
906 	while (ibt_status != IBT_CQ_EMPTY) {
907 	    bzero(&wc, sizeof (wc));
908 	    ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
909 	    if (ibt_status != IBT_SUCCESS)
910 		return;
911 
912 	/*
913 	 * Got a send completion
914 	 */
915 #ifdef DEBUG
916 	    if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
917 		cmn_err(CE_NOTE, "rib_svc_scq_handler: WR completed in error "
918 			"wc.wc_status:%d, wc_id:%llX",
919 			wc.wc_status, (longlong_t)wc.wc_id);
920 	    }
921 #endif
922 	    if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
923 		struct send_wid *wd = (struct send_wid *)wc.wc_id;
924 
925 		mutex_enter(&wd->sendwait_lock);
926 		if (wd->cv_sig == 1) {
927 			/*
928 			 * Update completion status and notify poster
929 			 */
930 			if (wc.wc_status == IBT_WC_SUCCESS)
931 				wd->status = RDMA_SUCCESS;
932 			else
933 				wd->status = RDMA_FAILED;
934 			cv_signal(&wd->wait_cv);
935 			mutex_exit(&wd->sendwait_lock);
936 		} else {
937 			/*
938 			 * Poster not waiting for notification.
939 			 * Free the send buffers and send_wid
940 			 */
941 			for (i = 0; i < wd->nsbufs; i++) {
942 				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
943 					(void *)wd->sbufaddr[i]);
944 			}
945 			mutex_exit(&wd->sendwait_lock);
946 			(void) rib_free_sendwait(wd);
947 		}
948 	    }
949 	}
950 }
951 
952 /*
953  * RCQ handler
954  */
955 /* ARGSUSED */
956 static void
957 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
958 {
959 	rib_qp_t	*qp;
960 	ibt_status_t	ibt_status;
961 	ibt_wc_t	wc;
962 	struct recv_wid	*rwid;
963 
964 	/*
965 	 * Re-enable cq notify here to avoid missing any
966 	 * completion queue notification.
967 	 */
968 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
969 
970 	ibt_status = IBT_SUCCESS;
971 	while (ibt_status != IBT_CQ_EMPTY) {
972 		bzero(&wc, sizeof (wc));
973 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
974 		if (ibt_status != IBT_SUCCESS)
975 		    return;
976 
977 		rwid = (struct recv_wid *)wc.wc_id;
978 		qp = rwid->qp;
979 		if (wc.wc_status == IBT_WC_SUCCESS) {
980 		    XDR			inxdrs, *xdrs;
981 		    uint_t		xid, vers, op, find_xid = 0;
982 		    struct reply	*r;
983 		    CONN *conn = qptoc(qp);
984 
985 		    xdrs = &inxdrs;
986 		    xdrmem_create(xdrs, (caddr_t)rwid->addr,
987 			wc.wc_bytes_xfer, XDR_DECODE);
988 		/*
989 		 * Treat xid as opaque (xid is the first entity
990 		 * in the rpc rdma message).
991 		 */
992 		    xid = *(uint32_t *)rwid->addr;
993 		/* Skip xid and set the xdr position accordingly. */
994 		    XDR_SETPOS(xdrs, sizeof (uint32_t));
995 		    (void) xdr_u_int(xdrs, &vers);
996 		    (void) xdr_u_int(xdrs, &op);
997 		    XDR_DESTROY(xdrs);
998 		    if (vers != RPCRDMA_VERS) {
999 			/*
1000 			 * Invalid RPC/RDMA version. Cannot interoperate.
1001 			 * Set connection to ERROR state and bail out.
1002 			 */
1003 			mutex_enter(&conn->c_lock);
1004 			if (conn->c_state != C_DISCONN_PEND)
1005 				conn->c_state = C_ERROR;
1006 			mutex_exit(&conn->c_lock);
1007 			rib_rbuf_free(conn, RECV_BUFFER, (void *)rwid->addr);
1008 			rib_free_wid(rwid);
1009 			continue;
1010 		    }
1011 
1012 		    mutex_enter(&qp->replylist_lock);
1013 		    for (r = qp->replylist; r != NULL; r = r->next) {
1014 			if (r->xid == xid) {
1015 			    find_xid = 1;
1016 			    switch (op) {
1017 			    case RDMA_MSG:
1018 			    case RDMA_NOMSG:
1019 			    case RDMA_MSGP:
1020 				r->status = RDMA_SUCCESS;
1021 				r->vaddr_cq = rwid->addr;
1022 				r->bytes_xfer = wc.wc_bytes_xfer;
1023 				cv_signal(&r->wait_cv);
1024 				break;
1025 			    default:
1026 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1027 						(void *)rwid->addr);
1028 				break;
1029 			    }
1030 			    break;
1031 			}
1032 		    }
1033 		    mutex_exit(&qp->replylist_lock);
1034 		    if (find_xid == 0) {
1035 			/* RPC caller not waiting for reply */
1036 #ifdef DEBUG
1037 			    if (rib_debug) {
1038 			cmn_err(CE_NOTE, "rib_clnt_rcq_handler: "
1039 			    "NO matching xid %u!\n", xid);
1040 			    }
1041 #endif
1042 			rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1043 				(void *)rwid->addr);
1044 		    }
1045 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1046 			CONN *conn = qptoc(qp);
1047 
1048 			/*
1049 			 * Connection being flushed. Just free
1050 			 * the posted buffer
1051 			 */
1052 			rib_rbuf_free(conn, RECV_BUFFER, (void *)rwid->addr);
1053 		} else {
1054 			CONN *conn = qptoc(qp);
1055 /*
1056  *  RC Recv Q Error Code		Local state     Remote State
1057  *  ====================		===========     ============
1058  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1059  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1060  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1061  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1062  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1063  *  IBT_WC_WR_FLUSHED_ERR               None            None
1064  */
1065 			/*
1066 			 * Channel in error state. Set connection
1067 			 * in ERROR state.
1068 			 */
1069 			mutex_enter(&conn->c_lock);
1070 			if (conn->c_state != C_DISCONN_PEND)
1071 				conn->c_state = C_ERROR;
1072 			mutex_exit(&conn->c_lock);
1073 			rib_rbuf_free(conn, RECV_BUFFER, (void *)rwid->addr);
1074 		}
1075 		rib_free_wid(rwid);
1076 	}
1077 }
1078 
1079 /*
 * Server side receive completion queue handler.
 *
 * Re-arms notification on cq_hdl and then polls it one work completion
 * at a time until ibt_poll_cq() stops returning IBT_SUCCESS. For every
 * completion the posted-receive count of the owning qp is decremented.
 * A successful completion has its RPC/RDMA header (xid, version, op)
 * decoded and is then either dropped (decode failure, wrong version),
 * used to wake an RDMA_DONE waiter, or wrapped in an mblk and handed to
 * svc_queuereq(). A failed completion only has its receive buffer
 * returned. The svc_recv descriptor is freed on every path.
 */
1080 /* ARGSUSED */
1081 static void
1082 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1083 {
1084 	struct recv_data *rd;
1085 	rib_qp_t	*qp;
1086 	ibt_status_t	ibt_status;
1087 	ibt_wc_t	wc;
1088 	struct svc_recv	*s_recvp;
1089 	CONN		*conn;
1090 	mblk_t		*mp;
1091 
1092 	/*
1093 	 * Re-enable cq notify here to avoid missing any
1094 	 * completion queue notification.
1095 	 */
1096 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1097 
1098 	ibt_status = IBT_SUCCESS;
1099 	while (ibt_status != IBT_CQ_EMPTY) {
1100 		bzero(&wc, sizeof (wc));
1101 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		/* Any non-success status (including an empty CQ) ends the drain. */
1102 		if (ibt_status != IBT_SUCCESS)
1103 		    return;
1104 
		/* wc_id carries the svc_recv descriptor of the posted buffer. */
1105 		s_recvp = (struct svc_recv *)wc.wc_id;
1106 		qp = s_recvp->qp;
1107 		conn = qptoc(qp);
		/*
		 * One receive buffer fewer is outstanding on this qp. Wake
		 * whoever is waiting on posted_rbufs_cv once none are left.
		 */
1108 		mutex_enter(&qp->posted_rbufs_lock);
1109 		qp->n_posted_rbufs--;
1110 		if (qp->n_posted_rbufs == 0)
1111 			cv_signal(&qp->posted_rbufs_cv);
1112 		mutex_exit(&qp->posted_rbufs_lock);
1113 
1114 		if (wc.wc_status == IBT_WC_SUCCESS) {
1115 		    XDR		inxdrs, *xdrs;
1116 		    uint_t	xid, vers, op;
1117 
1118 		    xdrs = &inxdrs;
1119 		    /* s_recvp->vaddr stores data */
1120 		    xdrmem_create(xdrs, (caddr_t)s_recvp->vaddr,
1121 			wc.wc_bytes_xfer, XDR_DECODE);
1122 
1123 		/*
1124 		 * Treat xid as opaque (xid is the first entity
1125 		 * in the rpc rdma message).
1126 		 */
1127 		    xid = *(uint32_t *)s_recvp->vaddr;
1128 		/* Skip xid and set the xdr position accordingly. */
1129 		    XDR_SETPOS(xdrs, sizeof (uint32_t));
		    /* Header could not be decoded: drop the message. */
1130 		    if (!xdr_u_int(xdrs, &vers) ||
1131 			!xdr_u_int(xdrs, &op)) {
1132 			rib_rbuf_free(conn, RECV_BUFFER,
1133 				(void *)s_recvp->vaddr);
1134 			XDR_DESTROY(xdrs);
1135 #ifdef DEBUG
1136 			cmn_err(CE_NOTE, "rib_svc_rcq_handler: "
1137 			    "xdr_u_int failed for qp %p, wc_id=%llx",
1138 			    (void *)qp, (longlong_t)wc.wc_id);
1139 #endif
1140 			(void) rib_free_svc_recv(s_recvp);
1141 			continue;
1142 		    }
1143 		    XDR_DESTROY(xdrs);
1144 
1145 		    if (vers != RPCRDMA_VERS) {
1146 			/*
1147 			 * Invalid RPC/RDMA version. Drop rpc rdma message.
1148 			 */
1149 			rib_rbuf_free(conn, RECV_BUFFER,
1150 				(void *)s_recvp->vaddr);
1151 			(void) rib_free_svc_recv(s_recvp);
1152 			continue;
1153 		    }
1154 			/*
1155 			 * Is this for RDMA_DONE?
1156 			 */
1157 		    if (op == RDMA_DONE) {
1158 			rib_rbuf_free(conn, RECV_BUFFER,
1159 				(void *)s_recvp->vaddr);
1160 			/*
1161 			 * Wake up the thread waiting on
1162 			 * a RDMA_DONE for xid
1163 			 */
1164 			mutex_enter(&qp->rdlist_lock);
1165 			rdma_done_notify(qp, xid);
1166 			mutex_exit(&qp->rdlist_lock);
1167 			(void) rib_free_svc_recv(s_recvp);
1168 			continue;
1169 		    }
1170 
		    /*
		     * plugin_state is sampled and the request queued under
		     * plugin_state_lock.
		     * NOTE(review): strwaitbuf() below is called with
		     * plugin_state_lock held - confirm that waiting here is
		     * acceptable in this handler's calling context.
		     */
1171 		    mutex_enter(&plugin_state_lock);
1172 		    if (plugin_state == ACCEPT) {
1173 			while ((mp = allocb(sizeof (*rd), BPRI_LO)) == NULL)
1174 			    (void) strwaitbuf(sizeof (*rd), BPRI_LO);
1175 			/*
1176 			 * Plugin is in accept state, hence the master
1177 			 * transport queue for this is still accepting
1178 			 * requests. Hence we can call svc_queuereq to
1179 			 * queue this received msg.
1180 			 */
1181 			rd = (struct recv_data *)mp->b_rptr;
1182 			rd->conn = conn;
1183 			rd->rpcmsg.addr = (caddr_t)s_recvp->vaddr;
1184 			rd->rpcmsg.type = RECV_BUFFER;
1185 			rd->rpcmsg.len = wc.wc_bytes_xfer;
1186 			rd->status = wc.wc_status;
			/*
			 * Hold a reference on the connection for the queued
			 * message; presumably released by whoever consumes
			 * it - verify against the svc_queuereq() consumer.
			 */
1187 			mutex_enter(&conn->c_lock);
1188 			conn->c_ref++;
1189 			mutex_exit(&conn->c_lock);
1190 			mp->b_wptr += sizeof (*rd);
1191 			svc_queuereq((queue_t *)rib_stat->q, mp);
1192 			mutex_exit(&plugin_state_lock);
1193 		    } else {
1194 			/*
1195 			 * The master transport for this is going
1196 			 * away and the queue is not accepting anymore
1197 			 * requests for krpc, so don't do anything, just
1198 			 * free the msg.
1199 			 */
1200 			mutex_exit(&plugin_state_lock);
1201 			rib_rbuf_free(conn, RECV_BUFFER,
1202 			(void *)s_recvp->vaddr);
1203 		    }
1204 		} else {
			/* Completion reported an error: just return the buffer. */
1205 			rib_rbuf_free(conn, RECV_BUFFER,
1206 				(void *)s_recvp->vaddr);
1207 		}
		/* The receive descriptor is no longer needed on any path. */
1208 		(void) rib_free_svc_recv(s_recvp);
1209 	}
1210 }
1211 
1212 /*
1213  * Handles DR event of IBT_HCA_DETACH_EVENT.
1214  */
1215 /* ARGSUSED */
1216 static void
1217 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1218 	ibt_async_code_t code, ibt_async_event_t *event)
1219 {
1220 
1221 	switch (code) {
1222 	case IBT_HCA_ATTACH_EVENT:
1223 		/* ignore */
1224 		break;
1225 	case IBT_HCA_DETACH_EVENT:
1226 	{
1227 		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1228 		rib_detach_hca(rib_stat->hca);
1229 #ifdef DEBUG
1230 	cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1231 #endif
1232 		break;
1233 	}
1234 #ifdef DEBUG
1235 	case IBT_EVENT_PATH_MIGRATED:
1236 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PATH_MIGRATED\n");
1237 		break;
1238 	case IBT_EVENT_SQD:
1239 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1240 		break;
1241 	case IBT_EVENT_COM_EST:
1242 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1243 		break;
1244 	case IBT_ERROR_CATASTROPHIC_CHAN:
1245 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CATASTROPHIC_CHAN\n");
1246 		break;
1247 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1248 	cmn_err(CE_NOTE, "rib_async_handler(): "
1249 		"IBT_ERROR_INVALID_REQUEST_CHAN\n");
1250 		break;
1251 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1252 	cmn_err(CE_NOTE, "rib_async_handler(): "
1253 		"IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1254 		break;
1255 	case IBT_ERROR_PATH_MIGRATE_REQ:
1256 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PATH_MIGRATE_REQ\n");
1257 		break;
1258 	case IBT_ERROR_CQ:
1259 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1260 		break;
1261 	case IBT_ERROR_PORT_DOWN:
1262 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1263 		break;
1264 	case IBT_EVENT_PORT_UP:
1265 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1266 		break;
1267 	case IBT_ASYNC_OPAQUE1:
1268 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1269 		break;
1270 	case IBT_ASYNC_OPAQUE2:
1271 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1272 		break;
1273 	case IBT_ASYNC_OPAQUE3:
1274 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1275 		break;
1276 	case IBT_ASYNC_OPAQUE4:
1277 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1278 		break;
1279 #endif
1280 	default:
1281 		break;
1282 	}
1283 }
1284 
1285 /*
1286  * Client's reachable function.
1287  */
1288 static rdma_stat
1289 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1290 {
1291 	rib_hca_t	*hca;
1292 	rdma_stat	status;
1293 
1294 	/*
1295 	 * First check if a hca is still attached
1296 	 */
1297 	*handle = NULL;
1298 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
1299 	if (rib_stat->hca->state != HCA_INITED) {
1300 		rw_exit(&rib_stat->hca->state_lock);
1301 		return (RDMA_FAILED);
1302 	}
1303 	status = rib_ping_srv(addr_type, raddr, &hca);
1304 	rw_exit(&rib_stat->hca->state_lock);
1305 
1306 	if (status == RDMA_SUCCESS) {
1307 		*handle = (void *)hca;
1308 		/*
1309 		 * Register the Address translation service
1310 		 */
1311 		mutex_enter(&rib_stat->open_hca_lock);
1312 		if (ats_running == 0) {
1313 			if (rib_register_ats(rib_stat->hca)
1314 			    == RDMA_SUCCESS) {
1315 				ats_running = 1;
1316 				mutex_exit(&rib_stat->open_hca_lock);
1317 				return (RDMA_SUCCESS);
1318 			} else {
1319 				mutex_exit(&rib_stat->open_hca_lock);
1320 				return (RDMA_FAILED);
1321 			}
1322 		} else {
1323 			mutex_exit(&rib_stat->open_hca_lock);
1324 			return (RDMA_SUCCESS);
1325 		}
1326 	} else {
1327 		*handle = NULL;
1328 		if (rib_debug > 2)
1329 		    cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n");
1330 		return (RDMA_FAILED);
1331 	}
1332 }
1333 
1334 /* Client side qp creation */
1335 static rdma_stat
1336 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1337 {
1338 	rib_qp_t	*kqp = NULL;
1339 	CONN		*conn;
1340 
1341 	ASSERT(qp != NULL);
1342 	*qp = NULL;
1343 
1344 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1345 	conn = qptoc(kqp);
1346 	kqp->hca = hca;
1347 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1348 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1349 
1350 	kqp->mode = RIB_CLIENT;
1351 	kqp->chan_flags = IBT_BLOCKING;
1352 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1353 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1354 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1355 
1356 	/*
1357 	 * Initialize
1358 	 */
1359 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1360 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1361 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1362 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1363 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1364 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1365 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1366 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1367 
1368 	*qp = kqp;
1369 	return (RDMA_SUCCESS);
1370 }
1371 
1372 /* Server side qp creation */
1373 static rdma_stat
1374 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1375 {
1376 	rib_qp_t	*kqp = NULL;
1377 	ibt_chan_sizes_t	chan_sizes;
1378 	ibt_rc_chan_alloc_args_t	qp_attr;
1379 	ibt_status_t		ibt_status;
1380 
1381 	ASSERT(qp != NULL);
1382 	*qp = NULL;
1383 
1384 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1385 	kqp->hca = hca;
1386 	kqp->port_num = port;
1387 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1388 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1389 
1390 	/*
1391 	 * Create the qp handle
1392 	 */
1393 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1394 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1395 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1396 	qp_attr.rc_pd = hca->pd_hdl;
1397 	qp_attr.rc_hca_port_num = port;
1398 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1399 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1400 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1401 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1402 	qp_attr.rc_clone_chan = NULL;
1403 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1404 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1405 
1406 	rw_enter(&hca->state_lock, RW_READER);
1407 	if (hca->state != HCA_DETACHED) {
1408 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1409 			IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1410 			&chan_sizes);
1411 	} else {
1412 		rw_exit(&hca->state_lock);
1413 		goto fail;
1414 	}
1415 	rw_exit(&hca->state_lock);
1416 
1417 	if (ibt_status != IBT_SUCCESS) {
1418 		cmn_err(CE_WARN, "rib_svc_create_chan: "
1419 			"ibt_alloc_rc_channel failed, ibt_status=%d.",
1420 			ibt_status);
1421 		goto fail;
1422 	}
1423 
1424 	kqp->mode = RIB_SERVER;
1425 	kqp->chan_flags = IBT_BLOCKING;
1426 	kqp->q = q;	/* server ONLY */
1427 
1428 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1429 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1430 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1431 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1432 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1433 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1434 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1435 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1436 	/*
1437 	 * Set the private data area to qp to be used in callbacks
1438 	 */
1439 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1440 	kqp->rdmaconn.c_state = C_CONNECTED;
1441 	*qp = kqp;
1442 	return (RDMA_SUCCESS);
1443 fail:
1444 	if (kqp)
1445 		kmem_free(kqp, sizeof (rib_qp_t));
1446 
1447 	return (RDMA_FAILED);
1448 }
1449 
1450 void
1451 rib_dump_pathrec(ibt_path_info_t *path_rec)
1452 {
1453 	ib_pkey_t	pkey;
1454 
1455 	if (rib_debug > 1) {
1456 	    cmn_err(CE_NOTE, "Path Record:\n");
1457 
1458 	    cmn_err(CE_NOTE, "Source HCA GUID = %llx\n",
1459 		(longlong_t)path_rec->pi_hca_guid);
1460 	    cmn_err(CE_NOTE, "Dest Service ID = %llx\n",
1461 		(longlong_t)path_rec->pi_sid);
1462 	    cmn_err(CE_NOTE, "Port Num        = %02d\n",
1463 		path_rec->pi_prim_cep_path.cep_hca_port_num);
1464 	    cmn_err(CE_NOTE, "P_Key Index     = %04d\n",
1465 		path_rec->pi_prim_cep_path.cep_pkey_ix);
1466 
1467 	    (void) ibt_index2pkey_byguid(path_rec->pi_hca_guid,
1468 			path_rec->pi_prim_cep_path.cep_hca_port_num,
1469 			path_rec->pi_prim_cep_path.cep_pkey_ix, &pkey);
1470 	    cmn_err(CE_NOTE, "P_Key		= 0x%x\n", pkey);
1471 
1472 
1473 	    cmn_err(CE_NOTE, "SGID:           = %llx:%llx\n",
1474 		(longlong_t)
1475 		path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_prefix,
1476 		(longlong_t)
1477 		path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_guid);
1478 
1479 	    cmn_err(CE_NOTE, "DGID:           = %llx:%llx\n",
1480 		(longlong_t)
1481 		path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_prefix,
1482 		(longlong_t)
1483 		path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_guid);
1484 
1485 	    cmn_err(CE_NOTE, "Path Rate       = %02x\n",
1486 		path_rec->pi_prim_cep_path.cep_adds_vect.av_srate);
1487 	    cmn_err(CE_NOTE, "SL              = %02x\n",
1488 		path_rec->pi_prim_cep_path.cep_adds_vect.av_srvl);
1489 	    cmn_err(CE_NOTE, "Prim Packet LT  = %02x\n",
1490 		path_rec->pi_prim_pkt_lt);
1491 	    cmn_err(CE_NOTE, "Path MTU        = %02x\n",
1492 		path_rec->pi_path_mtu);
1493 	}
1494 }
1495 
/*
 * Client side CM event handler (installed as oc_cm_handler by
 * rib_conn_to_srv(), with rib_stat as the client private argument).
 *
 * Only IBT_CM_EVENT_CONN_CLOSED is acted upon. When the close was not
 * initiated locally, the connection is moved to C_ERROR, its RC channel
 * is freed and, if nothing references the connection any more, the
 * connection itself is torn down via rib_disconnect_channel().
 * Always returns IBT_CM_ACCEPT.
 */
1496 /* ARGSUSED */
1497 ibt_cm_status_t
1498 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1499     ibt_cm_return_args_t *ret_args, void *priv_data,
1500     ibt_priv_data_len_t len)
1501 {
1502 	rpcib_state_t   *ribstat;
1503 	rib_hca_t	*hca;
1504 
1505 	ribstat = (rpcib_state_t *)clnt_hdl;
1506 	hca = (rib_hca_t *)ribstat->hca;
1507 
1508 	switch (event->cm_type) {
1509 
1510 	/* got a connection close event */
1511 	case IBT_CM_EVENT_CONN_CLOSED:
1512 	{
1513 		CONN	*conn;
1514 		rib_qp_t *qp;
1515 
1516 		/* check reason why connection was closed */
1517 		switch (event->cm_event.closed) {
1518 		case IBT_CM_CLOSED_DREP_RCVD:
1519 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1520 		case IBT_CM_CLOSED_DUP:
1521 		case IBT_CM_CLOSED_ABORT:
1522 		case IBT_CM_CLOSED_ALREADY:
1523 			/*
1524 			 * These cases indicate the local end initiated
1525 			 * the closing of the channel. Nothing to do here.
1526 			 */
1527 			break;
1528 		default:
1529 			/*
1530 			 * Reason for CONN_CLOSED event must be one of
1531 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1532 			 * or IBT_CM_CLOSED_STALE. These indicate cases where
1533 			 * the remote end is closing the channel. In these
1534 			 * cases free the channel and transition to error
1535 			 * state
1536 			 */
			/*
			 * NOTE(review): rib_conn_to_srv() sets the channel
			 * private data only after ibt_open_rc_channel()
			 * returns; if this event can be delivered before
			 * that, qp may not be set here - confirm.
			 */
1537 			qp = ibt_get_chan_private(event->cm_channel);
1538 			conn = qptoc(qp);
1539 			mutex_enter(&conn->c_lock);
			/* A disconnect is already pending: leave it alone. */
1540 			if (conn->c_state == C_DISCONN_PEND) {
1541 				mutex_exit(&conn->c_lock);
1542 				break;
1543 			}
1544 
1545 			conn->c_state = C_ERROR;
1546 
1547 			/*
1548 			 * Free the rc_channel. Channel has already
1549 			 * transitioned to ERROR state and WRs have been
1550 			 * FLUSHED_ERR already.
1551 			 */
1552 			(void) ibt_free_channel(qp->qp_hdl);
			/*
			 * With qp_hdl cleared, rib_disconnect_channel()
			 * skips its own channel close/free.
			 */
1553 			qp->qp_hdl = NULL;
1554 
1555 			/*
1556 			 * Free the conn if c_ref is down to 0 already
1557 			 */
1558 			if (conn->c_ref == 0) {
1559 				/*
1560 				 * Remove from list and free conn
1561 				 */
1562 				conn->c_state = C_DISCONN_PEND;
				/*
				 * c_lock must be dropped first:
				 * rib_disconnect_channel() destroys it and
				 * frees the qp, so conn/qp must not be
				 * touched after the call.
				 */
1563 				mutex_exit(&conn->c_lock);
1564 				(void) rib_disconnect_channel(conn,
1565 					&hca->cl_conn_list);
1566 			} else {
1567 				mutex_exit(&conn->c_lock);
1568 			}
1569 #ifdef DEBUG
1570 			if (rib_debug)
1571 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1572 					"(CONN_CLOSED) channel disconnected");
1573 #endif
1574 			break;
1575 		}
1576 		break;
1577 	}
1578 	default:
1579 		break;
1580 	}
1581 	return (IBT_CM_ACCEPT);
1582 }
1583 
1584 
1585 /* Check if server has done ATS registration */
1586 rdma_stat
1587 rib_chk_srv_ats(rib_hca_t *hca, struct netbuf *raddr,
1588 	int addr_type, ibt_path_info_t *path)
1589 {
1590 	struct sockaddr_in	*sin4;
1591 	struct sockaddr_in6	*sin6;
1592 	ibt_path_attr_t		path_attr;
1593 	ibt_status_t		ibt_status;
1594 	ib_pkey_t		pkey;
1595 	ibt_ar_t		ar_query, ar_result;
1596 	rib_service_t		*ats;
1597 	ib_gid_t		sgid;
1598 	ibt_path_info_t		paths[MAX_PORTS];
1599 	uint8_t			npaths, i;
1600 
1601 	(void) bzero(&path_attr, sizeof (ibt_path_attr_t));
1602 	(void) bzero(path, sizeof (ibt_path_info_t));
1603 
1604 	/*
1605 	 * Construct svc name
1606 	 */
1607 	path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
1608 	switch (addr_type) {
1609 	case AF_INET:
1610 		sin4 = (struct sockaddr_in *)raddr->buf;
1611 		(void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
1612 		    IB_SVC_NAME_LEN);
1613 		break;
1614 
1615 	case AF_INET6:
1616 		sin6 = (struct sockaddr_in6 *)raddr->buf;
1617 		(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
1618 		    path_attr.pa_sname, IB_SVC_NAME_LEN);
1619 		break;
1620 
1621 	default:
1622 		kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1623 		return (RDMA_INVAL);
1624 	}
1625 	(void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);
1626 
1627 	/*
1628 	 * Attempt a path to the server on an ATS-registered port.
1629 	 * Try all ATS-registered ports until one succeeds.
1630 	 * The first one that succeeds will be used to connect
1631 	 * to the server.  If none of them succeed, return RDMA_FAILED.
1632 	 */
1633 	rw_enter(&hca->state_lock, RW_READER);
1634 	if (hca->state != HCA_DETACHED) {
1635 	    rw_enter(&hca->service_list_lock, RW_READER);
1636 	    for (ats = hca->ats_list; ats != NULL; ats = ats->srv_next) {
1637 		path_attr.pa_hca_guid = hca->hca_guid;
1638 		path_attr.pa_hca_port_num = ats->srv_port;
1639 		ibt_status = ibt_get_paths(hca->ibt_clnt_hdl,
1640 			IBT_PATH_MULTI_SVC_DEST, &path_attr, 2, paths, &npaths);
1641 		if (ibt_status == IBT_SUCCESS ||
1642 			ibt_status == IBT_INSUFF_DATA) {
1643 		    for (i = 0; i < npaths; i++) {
1644 			if (paths[i].pi_hca_guid) {
1645 			/*
1646 			 * do ibt_query_ar()
1647 			 */
1648 			    sgid =
1649 				paths[i].pi_prim_cep_path.cep_adds_vect.av_sgid;
1650 
1651 			    (void) ibt_index2pkey_byguid(paths[i].pi_hca_guid,
1652 				paths[i].pi_prim_cep_path.cep_hca_port_num,
1653 				paths[i].pi_prim_cep_path.cep_pkey_ix, &pkey);
1654 
1655 			    bzero(&ar_query, sizeof (ar_query));
1656 			    bzero(&ar_result, sizeof (ar_result));
1657 			    ar_query.ar_gid =
1658 				paths[i].pi_prim_cep_path.cep_adds_vect.av_dgid;
1659 			    ar_query.ar_pkey = pkey;
1660 			    ibt_status = ibt_query_ar(&sgid, &ar_query,
1661 					&ar_result);
1662 			    if (ibt_status == IBT_SUCCESS) {
1663 #ifdef DEBUG
1664 				if (rib_debug > 1)
1665 				    rib_dump_pathrec(&paths[i]);
1666 #endif
1667 				bcopy(&paths[i], path,
1668 					sizeof (ibt_path_info_t));
1669 				rw_exit(&hca->service_list_lock);
1670 				kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1671 				rw_exit(&hca->state_lock);
1672 				return (RDMA_SUCCESS);
1673 			    }
1674 #ifdef DEBUG
1675 			    if (rib_debug) {
1676 				cmn_err(CE_NOTE, "rib_chk_srv_ats: "
1677 				    "ibt_query_ar FAILED, return\n");
1678 			    }
1679 #endif
1680 			}
1681 		    }
1682 		}
1683 	    }
1684 	    rw_exit(&hca->service_list_lock);
1685 	}
1686 	kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1687 	rw_exit(&hca->state_lock);
1688 	return (RDMA_FAILED);
1689 }
1690 
1691 
1692 /*
1693  * Connect to the server.
1694  */
1695 rdma_stat
1696 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path)
1697 {
1698 	ibt_chan_open_args_t	chan_args;	/* channel args */
1699 	ibt_chan_sizes_t	chan_sizes;
1700 	ibt_rc_chan_alloc_args_t	qp_attr;
1701 	ibt_status_t		ibt_status;
1702 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1703 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1704 
1705 	(void) bzero(&chan_args, sizeof (chan_args));
1706 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1707 
1708 	qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
1709 	/* Alloc a RC channel */
1710 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1711 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1712 	qp_attr.rc_pd = hca->pd_hdl;
1713 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1714 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1715 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1716 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1717 	qp_attr.rc_clone_chan = NULL;
1718 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1719 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1720 
1721 	chan_args.oc_path = path;
1722 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1723 	chan_args.oc_cm_clnt_private = (void *)rib_stat;
1724 	chan_args.oc_rdma_ra_out = 1;
1725 	chan_args.oc_rdma_ra_in = 1;
1726 	chan_args.oc_path_retry_cnt = 2;
1727 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1728 
1729 refresh:
1730 	rw_enter(&hca->state_lock, RW_READER);
1731 	if (hca->state != HCA_DETACHED) {
1732 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1733 			IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl,
1734 			&chan_sizes);
1735 	} else {
1736 		rw_exit(&hca->state_lock);
1737 		return (RDMA_FAILED);
1738 	}
1739 	rw_exit(&hca->state_lock);
1740 
1741 	if (ibt_status != IBT_SUCCESS) {
1742 #ifdef DEBUG
1743 		cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel "
1744 		"failed, ibt_status=%d.", ibt_status);
1745 #endif
1746 		return (RDMA_FAILED);
1747 	}
1748 
1749 	/* Connect to the Server */
1750 	(void) bzero(&ret_args, sizeof (ret_args));
1751 	mutex_enter(&qp->cb_lock);
1752 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1753 			IBT_BLOCKING, &chan_args, &ret_args);
1754 	if (ibt_status != IBT_SUCCESS) {
1755 #ifdef DEBUG
1756 		if (rib_debug)
1757 			cmn_err(CE_WARN, "rib_conn_to_srv: open_rc_channel"
1758 				" failed for qp %p, status=%d, "
1759 				"ret_args.rc_status=%d\n",
1760 				(void *)qp, ibt_status, ret_args.rc_status);
1761 #endif
1762 		(void) ibt_free_channel(qp->qp_hdl);
1763 		qp->qp_hdl = NULL;
1764 		mutex_exit(&qp->cb_lock);
1765 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1766 			ret_args.rc_status == IBT_CM_CONN_STALE) {
1767 			/*
1768 			 * Got IBT_CM_CONN_STALE probably because of stale
1769 			 * data on the passive end of a channel that existed
1770 			 * prior to reboot. Retry establishing a channel
1771 			 * REFRESH_ATTEMPTS times, during which time the
1772 			 * stale conditions on the server might clear up.
1773 			 */
1774 			goto refresh;
1775 		}
1776 		return (RDMA_FAILED);
1777 	}
1778 	mutex_exit(&qp->cb_lock);
1779 	/*
1780 	 * Set the private data area to qp to be used in callbacks
1781 	 */
1782 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1783 	return (RDMA_SUCCESS);
1784 }
1785 
1786 rdma_stat
1787 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
1788 {
1789 	struct sockaddr_in	*sin4;
1790 	struct sockaddr_in6	*sin6;
1791 	ibt_path_attr_t		path_attr;
1792 	ibt_path_info_t		path;
1793 	ibt_status_t		ibt_status;
1794 
1795 	ASSERT(raddr->buf != NULL);
1796 
1797 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1798 	bzero(&path, sizeof (ibt_path_info_t));
1799 
1800 	/*
1801 	 * Conctruct svc name
1802 	 */
1803 	path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
1804 	switch (addr_type) {
1805 	case AF_INET:
1806 		sin4 = (struct sockaddr_in *)raddr->buf;
1807 		(void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
1808 		    IB_SVC_NAME_LEN);
1809 		break;
1810 
1811 	case AF_INET6:
1812 		sin6 = (struct sockaddr_in6 *)raddr->buf;
1813 		(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
1814 		    path_attr.pa_sname, IB_SVC_NAME_LEN);
1815 		break;
1816 
1817 	default:
1818 #ifdef	DEBUG
1819 	    if (rib_debug) {
1820 		cmn_err(CE_WARN, "rib_ping_srv: Address not recognized\n");
1821 	    }
1822 #endif
1823 		kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1824 		return (RDMA_INVAL);
1825 	}
1826 	(void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);
1827 
1828 	ibt_status = ibt_get_paths(rib_stat->ibt_clnt_hdl,
1829 		IBT_PATH_NO_FLAGS, &path_attr, 1, &path, NULL);
1830 	kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1831 	if (ibt_status != IBT_SUCCESS) {
1832 	    if (rib_debug > 1) {
1833 		cmn_err(CE_WARN, "rib_ping_srv: ibt_get_paths FAILED!"
1834 			" status=%d\n", ibt_status);
1835 	    }
1836 	} else if (path.pi_hca_guid) {
1837 		ASSERT(path.pi_hca_guid == rib_stat->hca->hca_guid);
1838 		*hca = rib_stat->hca;
1839 		return (RDMA_SUCCESS);
1840 	}
1841 	return (RDMA_FAILED);
1842 }
1843 
1844 /*
1845  * Close channel, remove from connection list and
1846  * free up resources allocated for that channel.
 *
 * conn_list is the list the connection is linked on, or NULL if it
 * was never put on one. When qp_hdl is still set, the channel is
 * closed (conn_list != NULL) or flushed (conn_list == NULL), all
 * posted receive buffers are waited for, and the channel is freed.
 * The qp, which contains conn, is freed here, so neither may be used
 * by the caller afterwards. Always returns RDMA_SUCCESS.
1847  */
1848 rdma_stat
1849 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
1850 {
1851 	rib_qp_t	*qp = ctoqp(conn);
1852 	rib_hca_t	*hca;
1853 
1854 	/*
1855 	 * c_ref == 0 and connection is in C_DISCONN_PEND
1856 	 */
	/* Remember the hca now; qp is freed before it is needed below. */
1857 	hca = qp->hca;
1858 	if (conn_list != NULL)
1859 		(void) rib_rm_conn(conn, conn_list);
1860 	if (qp->qp_hdl != NULL) {
1861 		/*
1862 		 * If the channel has not been established (no conn_list),
1863 		 * ibt_flush_channel is called to flush outstanding WRs
1864 		 * on the Qs.  Otherwise, ibt_close_rc_channel() is
1865 		 * called.  The channel is then freed.
1866 		 */
1867 		if (conn_list != NULL)
1868 		    (void) ibt_close_rc_channel(qp->qp_hdl,
1869 			IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
1870 		else
1871 		    (void) ibt_flush_channel(qp->qp_hdl);
1872 
		/*
		 * Wait until every posted receive buffer has completed;
		 * the receive CQ handlers signal posted_rbufs_cv when
		 * n_posted_rbufs drops to zero.
		 */
1873 		mutex_enter(&qp->posted_rbufs_lock);
1874 		while (qp->n_posted_rbufs)
1875 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
1876 		mutex_exit(&qp->posted_rbufs_lock);
1877 		(void) ibt_free_channel(qp->qp_hdl);
1878 		qp->qp_hdl = NULL;
1879 	}
1880 	ASSERT(qp->rdlist == NULL);
1881 	if (qp->replylist != NULL) {
1882 		(void) rib_rem_replylist(qp);
1883 	}
1884 
	/* Tear down the synchronisation objects set up at qp creation. */
1885 	cv_destroy(&qp->cb_conn_cv);
1886 	cv_destroy(&qp->posted_rbufs_cv);
1887 	mutex_destroy(&qp->cb_lock);
1888 
1889 	mutex_destroy(&qp->replylist_lock);
1890 	mutex_destroy(&qp->posted_rbufs_lock);
1891 	mutex_destroy(&qp->rdlist_lock);
1892 
1893 	cv_destroy(&conn->c_cv);
1894 	mutex_destroy(&conn->c_lock);
1895 
1896 	if (conn->c_raddr.buf != NULL) {
1897 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
1898 	}
1899 	if (conn->c_laddr.buf != NULL) {
1900 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
1901 	}
1902 	kmem_free(qp, sizeof (rib_qp_t));
1903 
1904 	/*
1905 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
1906 	 * then the hca is no longer being used.
1907 	 */
1908 	if (conn_list != NULL) {
1909 		rw_enter(&hca->state_lock, RW_READER);
1910 		if (hca->state == HCA_DETACHED) {
1911 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
1912 			if (hca->srv_conn_list.conn_hd == NULL) {
1913 				rw_enter(&hca->cl_conn_list.conn_lock,
1914 					RW_READER);
1915 				if (hca->cl_conn_list.conn_hd == NULL) {
					/*
					 * Both lists are empty: mark the
					 * hca unused and signal cb_cv.
					 */
1916 					mutex_enter(&hca->inuse_lock);
1917 					hca->inuse = FALSE;
1918 					cv_signal(&hca->cb_cv);
1919 					mutex_exit(&hca->inuse_lock);
1920 				}
1921 				rw_exit(&hca->cl_conn_list.conn_lock);
1922 			}
1923 			rw_exit(&hca->srv_conn_list.conn_lock);
1924 		}
1925 		rw_exit(&hca->state_lock);
1926 	}
1927 	return (RDMA_SUCCESS);
1928 }
1929 
1930 /*
1931  * Wait for send completion notification. Only on receiving a
1932  * notification be it a successful or error completion, free the
1933  * send_wid.
 *
 * If the send is still pending (status == SEND_WAIT) the caller is
 * blocked for at most SEND_WAIT_TIME seconds. In RIB_SERVER mode the
 * wait uses cv_timedwait(); in any other mode it uses
 * cv_timedwait_sig() and may also be interrupted.
 *
 * Returns RDMA_SUCCESS on a successful completion, RDMA_CONNLOST or
 * RDMA_FAILED on an error completion, RDMA_TIMEDOUT on timeout and
 * RDMA_INTR when interrupted. When the wait ends with no completion
 * recorded, wd is NOT freed here and cv_sig is cleared; the comment
 * above rib_send_and_wait() names the scq_handler as the other place
 * send buffers are freed - presumably wd as well, verify there.
1934  */
1935 static rdma_stat
1936 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
1937 {
1938 	clock_t timout, cv_wait_ret;
1939 	rdma_stat error = RDMA_SUCCESS;
1940 	int	i;
1941 
1942 	/*
1943 	 * Wait for send to complete
1944 	 */
1945 	ASSERT(wd != NULL);
1946 	mutex_enter(&wd->sendwait_lock);
1947 	if (wd->status == (uint_t)SEND_WAIT) {
		/* Absolute deadline, in lbolt ticks. */
1948 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
1949 		    ddi_get_lbolt();
1950 		if (qp->mode == RIB_SERVER) {
			/*
			 * Keep waiting while wakeups arrive (> 0) but the
			 * status has not changed yet.
			 */
1951 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
1952 				    &wd->sendwait_lock, timout)) > 0 &&
1953 			    wd->status == (uint_t)SEND_WAIT)
1954 				;
1955 			switch (cv_wait_ret) {
1956 			case -1:	/* timeout */
1957 #ifdef DEBUG
1958 				if (rib_debug > 2)
1959 					cmn_err(CE_WARN, "rib_sendwait: "
1960 					    "timed out qp %p\n", (void *)qp);
1961 #endif
1962 				wd->cv_sig = 0;		/* no signal needed */
1963 				error = RDMA_TIMEDOUT;
1964 				break;
1965 			default:	/* got send completion */
1966 				break;
1967 			}
1968 		} else {
			/* Same loop, but a signal (return 0) also ends it. */
1969 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
1970 				    &wd->sendwait_lock, timout)) > 0 &&
1971 			    wd->status == (uint_t)SEND_WAIT)
1972 				;
1973 			switch (cv_wait_ret) {
1974 			case -1:	/* timeout */
1975 #ifdef DEBUG
1976 				if (rib_debug > 2)
1977 					cmn_err(CE_WARN, "rib_sendwait: "
1978 					    "timed out qp %p\n", (void *)qp);
1979 #endif
1980 				wd->cv_sig = 0;		/* no signal needed */
1981 				error = RDMA_TIMEDOUT;
1982 				break;
1983 			case 0:		/* interrupted */
1984 #ifdef DEBUG
1985 				if (rib_debug > 2)
1986 					cmn_err(CE_NOTE, "rib_sendwait:"
1987 					    " interrupted on qp %p\n",
1988 					    (void *)qp);
1989 #endif
1990 				wd->cv_sig = 0;		/* no signal needed */
1991 				error = RDMA_INTR;
1992 				break;
1993 			default:	/* got send completion */
1994 				break;
1995 			}
1996 		}
1997 	}
1998 
	/*
	 * NOTE(review): if the wait timed out or was interrupted but a
	 * successful completion has been recorded by now, wd is freed
	 * below while RDMA_TIMEDOUT / RDMA_INTR is still returned -
	 * confirm callers cope with that combination.
	 */
1999 	if (wd->status != (uint_t)SEND_WAIT) {
2000 		/* got send completion */
2001 		if (wd->status != RDMA_SUCCESS) {
			/* Only RDMA_CONNLOST is passed through unchanged. */
2002 		    error = wd->status;
2003 		    if (wd->status != RDMA_CONNLOST)
2004 			error = RDMA_FAILED;
2005 		}
2006 		for (i = 0; i < wd->nsbufs; i++) {
2007 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2008 				(void *)wd->sbufaddr[i]);
2009 		}
		/* Drop the lock before rib_free_sendwait() destroys it. */
2010 		mutex_exit(&wd->sendwait_lock);
2011 		(void) rib_free_sendwait(wd);
2012 	} else {
2013 		mutex_exit(&wd->sendwait_lock);
2014 	}
2015 
2016 	return (error);
2017 }
2018 
2019 static struct send_wid *
2020 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2021 {
2022 	struct send_wid	*wd;
2023 
2024 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2025 	wd->xid = xid;
2026 	wd->cv_sig = cv_sig;
2027 	wd->qp = qp;
2028 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2029 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2030 	wd->status = (uint_t)SEND_WAIT;
2031 
2032 	return (wd);
2033 }
2034 
2035 static int
2036 rib_free_sendwait(struct send_wid *wdesc)
2037 {
2038 	cv_destroy(&wdesc->wait_cv);
2039 	mutex_destroy(&wdesc->sendwait_lock);
2040 	kmem_free(wdesc, sizeof (*wdesc));
2041 
2042 	return (0);
2043 }
2044 
2045 static rdma_stat
2046 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2047 {
2048 	mutex_enter(&qp->replylist_lock);
2049 	if (rep != NULL) {
2050 	    (void) rib_remreply(qp, rep);
2051 	    mutex_exit(&qp->replylist_lock);
2052 	    return (RDMA_SUCCESS);
2053 	}
2054 	mutex_exit(&qp->replylist_lock);
2055 	return (RDMA_FAILED);
2056 }
2057 
2058 /*
2059  * Send buffers are freed here only in case of error in posting
2060  * on QP. If the post succeeded, the send buffers are freed upon
2061  * send completion in rib_sendwait() or in the scq_handler.
2062  */
2063 rdma_stat
2064 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2065 	int send_sig, int cv_sig)
2066 {
2067 	struct send_wid	*wdesc;
2068 	struct clist	*clp;
2069 	ibt_status_t	ibt_status = IBT_SUCCESS;
2070 	rdma_stat	ret = RDMA_SUCCESS;
2071 	ibt_send_wr_t	tx_wr;
2072 	int		i, nds;
2073 	ibt_wr_ds_t	sgl[DSEG_MAX];
2074 	uint_t		total_msg_size;
2075 	rib_qp_t	*qp = ctoqp(conn);
2076 
2077 	ASSERT(cl != NULL);
2078 
2079 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2080 
2081 	nds = 0;
2082 	total_msg_size = 0;
2083 	clp = cl;
2084 	while (clp != NULL) {
2085 		if (nds >= DSEG_MAX) {
2086 			cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX"
2087 			    " too small!");
2088 			return (RDMA_FAILED);
2089 		}
2090 		sgl[nds].ds_va = clp->c_saddr;
2091 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2092 		sgl[nds].ds_len = clp->c_len;
2093 		total_msg_size += clp->c_len;
2094 		clp = clp->c_next;
2095 		nds++;
2096 	}
2097 
2098 	if (send_sig) {
2099 		/* Set SEND_SIGNAL flag. */
2100 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2101 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2102 	} else {
2103 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2104 		wdesc = rib_init_sendwait(msgid, 0, qp);
2105 	}
2106 	wdesc->nsbufs = nds;
2107 	for (i = 0; i < nds; i++) {
2108 		wdesc->sbufaddr[i] = sgl[i].ds_va;
2109 	}
2110 
2111 	tx_wr.wr_id = (ibt_wrid_t)wdesc;
2112 	tx_wr.wr_opcode = IBT_WRC_SEND;
2113 	tx_wr.wr_trans = IBT_RC_SRV;
2114 	tx_wr.wr_nds = nds;
2115 	tx_wr.wr_sgl = sgl;
2116 
2117 	mutex_enter(&conn->c_lock);
2118 	if (conn->c_state & C_CONNECTED) {
2119 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2120 	}
2121 	if (((conn->c_state & C_CONNECTED) == 0) ||
2122 		ibt_status != IBT_SUCCESS) {
2123 		mutex_exit(&conn->c_lock);
2124 		for (i = 0; i < nds; i++) {
2125 			rib_rbuf_free(conn, SEND_BUFFER,
2126 				(void *)wdesc->sbufaddr[i]);
2127 		}
2128 		(void) rib_free_sendwait(wdesc);
2129 #ifdef DEBUG
2130 		if (rib_debug && ibt_status != IBT_SUCCESS)
2131 			cmn_err(CE_WARN, "rib_send_and_wait: ibt_post_send "
2132 				"failed! wr_id %llx on qpn %p, status=%d!",
2133 				(longlong_t)tx_wr.wr_id, (void *)qp,
2134 				ibt_status);
2135 #endif
2136 		return (RDMA_FAILED);
2137 	}
2138 	mutex_exit(&conn->c_lock);
2139 
2140 	if (send_sig) {
2141 	    if (cv_sig) {
2142 		/*
2143 		 * cv_wait for send to complete.
2144 		 * We can fail due to a timeout or signal or
2145 		 * unsuccessful send.
2146 		 */
2147 		ret = rib_sendwait(qp, wdesc);
2148 #ifdef DEBUG
2149 	    if (rib_debug > 2)
2150 		if (ret != 0) {
2151 		    cmn_err(CE_WARN, "rib_send_and_wait: rib_sendwait "
2152 			"FAILED, rdma stat=%d, wr_id %llx, qp %p!",
2153 			ret, (longlong_t)tx_wr.wr_id, (void *)qp);
2154 		}
2155 #endif
2156 		return (ret);
2157 	    }
2158 	}
2159 
2160 	return (RDMA_SUCCESS);
2161 }
2162 
2163 rdma_stat
2164 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2165 {
2166 	rdma_stat	ret;
2167 
2168 	/* send-wait & cv_signal */
2169 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1);
2170 
2171 	return (ret);
2172 }
2173 
2174 /*
2175  * Server interface (svc_rdma_ksend).
2176  * Send RPC reply and wait for RDMA_DONE.
2177  */
2178 rdma_stat
2179 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2180 {
2181 	rdma_stat ret = RDMA_SUCCESS;
2182 	struct rdma_done_list *rd;
2183 	clock_t timout, cv_wait_ret;
2184 	rib_qp_t *qp = ctoqp(conn);
2185 
2186 	mutex_enter(&qp->rdlist_lock);
2187 	rd = rdma_done_add(qp, msgid);
2188 
2189 	/* No cv_signal (whether send-wait or no-send-wait) */
2190 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0);
2191 	if (ret != RDMA_SUCCESS) {
2192 #ifdef DEBUG
2193 	    cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
2194 		"failed, msgid %u, qp %p", msgid, (void *)qp);
2195 #endif
2196 	    rdma_done_rm(qp, rd);
2197 	    goto done;
2198 	}
2199 
2200 	/*
2201 	 * Wait for RDMA_DONE from remote end
2202 	 */
2203 	timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2204 	cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
2205 	    timout);
2206 	rdma_done_rm(qp, rd);
2207 	if (cv_wait_ret < 0) {
2208 #ifdef DEBUG
2209 		if (rib_debug > 1) {
2210 			cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not"
2211 			    " recv'd for qp %p, xid:%u\n",
2212 			    (void *)qp, msgid);
2213 		}
2214 #endif
2215 		ret = RDMA_TIMEDOUT;
2216 		goto done;
2217 	}
2218 
2219 done:
2220 	mutex_exit(&qp->rdlist_lock);
2221 	return (ret);
2222 }
2223 
2224 static struct recv_wid *
2225 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2226 {
2227 	struct recv_wid	*rwid;
2228 
2229 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2230 	rwid->xid = msgid;
2231 	rwid->addr = sgl->ds_va;
2232 	rwid->qp = qp;
2233 
2234 	return (rwid);
2235 }
2236 
2237 static void
2238 rib_free_wid(struct recv_wid *rwid)
2239 {
2240 	kmem_free(rwid, sizeof (struct recv_wid));
2241 }
2242 
2243 rdma_stat
2244 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2245 {
2246 	rib_qp_t	*qp = ctoqp(conn);
2247 	struct clist	*clp = cl;
2248 	struct reply	*rep;
2249 	struct recv_wid	*rwid;
2250 	int		nds;
2251 	ibt_wr_ds_t	sgl[DSEG_MAX];
2252 	ibt_recv_wr_t	recv_wr;
2253 	rdma_stat	ret;
2254 	ibt_status_t	ibt_status;
2255 
2256 	/*
2257 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2258 	 */
2259 
2260 	nds = 0;
2261 	while (cl != NULL) {
2262 		if (nds >= DSEG_MAX) {
2263 		    cmn_err(CE_WARN, "rib_clnt_post: DSEG_MAX too small!");
2264 		    ret = RDMA_FAILED;
2265 		    goto done;
2266 		}
2267 		sgl[nds].ds_va = cl->c_saddr;
2268 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2269 		sgl[nds].ds_len = cl->c_len;
2270 		cl = cl->c_next;
2271 		nds++;
2272 	}
2273 
2274 	if (nds != 1) {
2275 	    cmn_err(CE_WARN, "rib_clnt_post: nds!=1\n");
2276 	    ret = RDMA_FAILED;
2277 	    goto done;
2278 	}
2279 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2280 	recv_wr.wr_nds = nds;
2281 	recv_wr.wr_sgl = sgl;
2282 
2283 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2284 	if (rwid) {
2285 	    recv_wr.wr_id = (ibt_wrid_t)rwid;
2286 	} else {
2287 		cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2288 		ret = RDMA_NORESOURCE;
2289 		goto done;
2290 	}
2291 	rep = rib_addreplylist(qp, msgid);
2292 	if (!rep) {
2293 		cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2294 		rib_free_wid(rwid);
2295 		ret = RDMA_NORESOURCE;
2296 		goto done;
2297 	}
2298 
2299 	mutex_enter(&conn->c_lock);
2300 	if (conn->c_state & C_CONNECTED) {
2301 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2302 	}
2303 	if (((conn->c_state & C_CONNECTED) == 0) ||
2304 		ibt_status != IBT_SUCCESS) {
2305 		mutex_exit(&conn->c_lock);
2306 #ifdef DEBUG
2307 		cmn_err(CE_WARN, "rib_clnt_post: QPN %p failed in "
2308 		    "ibt_post_recv(), msgid=%d, status=%d",
2309 		    (void *)qp,  msgid, ibt_status);
2310 #endif
2311 		rib_free_wid(rwid);
2312 		(void) rib_rem_rep(qp, rep);
2313 		ret = RDMA_FAILED;
2314 		goto done;
2315 	}
2316 	mutex_exit(&conn->c_lock);
2317 	return (RDMA_SUCCESS);
2318 
2319 done:
2320 	while (clp != NULL) {
2321 	    rib_rbuf_free(conn, RECV_BUFFER, (void *)clp->c_saddr);
2322 	    clp = clp->c_next;
2323 	}
2324 	return (ret);
2325 }
2326 
2327 rdma_stat
2328 rib_svc_post(CONN* conn, struct clist *cl)
2329 {
2330 	rib_qp_t	*qp = ctoqp(conn);
2331 	struct svc_recv	*s_recvp;
2332 	int		nds;
2333 	ibt_wr_ds_t	sgl[DSEG_MAX];
2334 	ibt_recv_wr_t	recv_wr;
2335 	ibt_status_t	ibt_status;
2336 
2337 	nds = 0;
2338 	while (cl != NULL) {
2339 		if (nds >= DSEG_MAX) {
2340 		    cmn_err(CE_WARN, "rib_svc_post: DSEG_MAX too small!");
2341 		    return (RDMA_FAILED);
2342 		}
2343 		sgl[nds].ds_va = cl->c_saddr;
2344 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2345 		sgl[nds].ds_len = cl->c_len;
2346 		cl = cl->c_next;
2347 		nds++;
2348 	}
2349 
2350 	if (nds != 1) {
2351 	    cmn_err(CE_WARN, "rib_svc_post: nds!=1\n");
2352 	    rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)sgl[0].ds_va);
2353 	    return (RDMA_FAILED);
2354 	}
2355 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2356 	recv_wr.wr_nds = nds;
2357 	recv_wr.wr_sgl = sgl;
2358 
2359 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2360 	recv_wr.wr_id = (ibt_wrid_t)s_recvp; /* Use s_recvp's addr as wr id */
2361 	mutex_enter(&conn->c_lock);
2362 	if (conn->c_state & C_CONNECTED) {
2363 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2364 	}
2365 	if (((conn->c_state & C_CONNECTED) == 0) ||
2366 		ibt_status != IBT_SUCCESS) {
2367 		mutex_exit(&conn->c_lock);
2368 #ifdef DEBUG
2369 		cmn_err(CE_WARN, "rib_svc_post: QP %p failed in "
2370 		    "ibt_post_recv(), status=%d",
2371 		    (void *)qp, ibt_status);
2372 #endif
2373 		rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)sgl[0].ds_va);
2374 		(void) rib_free_svc_recv(s_recvp);
2375 		return (RDMA_FAILED);
2376 	}
2377 	mutex_exit(&conn->c_lock);
2378 
2379 	return (RDMA_SUCCESS);
2380 }
2381 
2382 /* Client */
2383 rdma_stat
2384 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2385 {
2386 
2387 	return (rib_clnt_post(conn, cl, msgid));
2388 }
2389 
2390 /* Server */
2391 rdma_stat
2392 rib_post_recv(CONN *conn, struct clist *cl)
2393 {
2394 	rib_qp_t	*qp = ctoqp(conn);
2395 
2396 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2397 		mutex_enter(&qp->posted_rbufs_lock);
2398 		qp->n_posted_rbufs++;
2399 		mutex_exit(&qp->posted_rbufs_lock);
2400 		return (RDMA_SUCCESS);
2401 	}
2402 	return (RDMA_FAILED);
2403 }
2404 
2405 /*
2406  * Client side only interface to "recv" the rpc reply buf
2407  * posted earlier by rib_post_resp(conn, cl, msgid).
2408  */
2409 rdma_stat
2410 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2411 {
2412 	struct reply *rep = NULL;
2413 	clock_t timout, cv_wait_ret;
2414 	rdma_stat ret = RDMA_SUCCESS;
2415 	rib_qp_t *qp = ctoqp(conn);
2416 
2417 	/*
2418 	 * Find the reply structure for this msgid
2419 	 */
2420 	mutex_enter(&qp->replylist_lock);
2421 
2422 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2423 	    if (rep->xid == msgid)
2424 		break;
2425 	}
2426 	if (rep != NULL) {
2427 		/*
2428 		 * If message not yet received, wait.
2429 		 */
2430 		if (rep->status == (uint_t)REPLY_WAIT) {
2431 			timout = ddi_get_lbolt() +
2432 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2433 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2434 				    &qp->replylist_lock, timout)) > 0 &&
2435 			    rep->status == (uint_t)REPLY_WAIT);
2436 
2437 			switch (cv_wait_ret) {
2438 			case -1:	/* timeout */
2439 				ret = RDMA_TIMEDOUT;
2440 				break;
2441 			case 0:
2442 				ret = RDMA_INTR;
2443 				break;
2444 			default:
2445 				break;
2446 			}
2447 		}
2448 
2449 		if (rep->status == RDMA_SUCCESS) {
2450 			struct clist *cl = NULL;
2451 
2452 			/*
2453 			 * Got message successfully
2454 			 */
2455 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2456 			    (caddr_t)rep->vaddr_cq, NULL, NULL);
2457 			*clp = cl;
2458 		} else {
2459 			if (rep->status != (uint_t)REPLY_WAIT) {
2460 				/*
2461 				 * Got error in reply message. Free
2462 				 * recv buffer here.
2463 				 */
2464 				ret = rep->status;
2465 				rib_rbuf_free(conn, RECV_BUFFER,
2466 					(caddr_t)rep->vaddr_cq);
2467 			}
2468 		}
2469 		(void) rib_remreply(qp, rep);
2470 	} else {
2471 		/*
2472 		 * No matching reply structure found for given msgid on the
2473 		 * reply wait list.
2474 		 */
2475 		ret = RDMA_INVAL;
2476 #ifdef DEBUG
2477 		cmn_err(CE_WARN, "rib_recv: no matching reply for "
2478 		    "xid %u, qp %p\n", msgid, (void *)qp);
2479 #endif
2480 	}
2481 
2482 	/*
2483 	 * Done.
2484 	 */
2485 	mutex_exit(&qp->replylist_lock);
2486 	return (ret);
2487 }
2488 
2489 /*
2490  * RDMA write a buffer to the remote address.
2491  */
2492 rdma_stat
2493 rib_write(CONN *conn, struct clist *cl, int wait)
2494 {
2495 	ibt_send_wr_t	tx_wr;
2496 	int		nds;
2497 	int		cv_sig;
2498 	ibt_wr_ds_t	sgl[DSEG_MAX];
2499 	struct send_wid	*wdesc;
2500 	ibt_status_t	ibt_status;
2501 	rdma_stat	ret = RDMA_SUCCESS;
2502 	rib_qp_t	*qp = ctoqp(conn);
2503 
2504 	if (cl == NULL) {
2505 		cmn_err(CE_WARN, "rib_write: NULL clist\n");
2506 		return (RDMA_FAILED);
2507 	}
2508 
2509 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2510 	/*
2511 	 * Remote address is at the head chunk item in list.
2512 	 */
2513 	tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
2514 	tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */
2515 
2516 	nds = 0;
2517 	while (cl != NULL) {
2518 		if (nds >= DSEG_MAX) {
2519 			cmn_err(CE_WARN, "rib_write: DSEG_MAX too small!");
2520 			return (RDMA_FAILED);
2521 		}
2522 		sgl[nds].ds_va = cl->c_saddr;
2523 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2524 		sgl[nds].ds_len = cl->c_len;
2525 		cl = cl->c_next;
2526 		nds++;
2527 	}
2528 
2529 	if (wait) {
2530 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2531 		cv_sig = 1;
2532 	} else {
2533 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2534 		cv_sig = 0;
2535 	}
2536 
2537 	wdesc = rib_init_sendwait(0, cv_sig, qp);
2538 	tx_wr.wr_id = (ibt_wrid_t)wdesc;
2539 	tx_wr.wr_opcode = IBT_WRC_RDMAW;
2540 	tx_wr.wr_trans = IBT_RC_SRV;
2541 	tx_wr.wr_nds = nds;
2542 	tx_wr.wr_sgl = sgl;
2543 
2544 	mutex_enter(&conn->c_lock);
2545 	if (conn->c_state & C_CONNECTED) {
2546 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2547 	}
2548 	if (((conn->c_state & C_CONNECTED) == 0) ||
2549 		ibt_status != IBT_SUCCESS) {
2550 		mutex_exit(&conn->c_lock);
2551 		(void) rib_free_sendwait(wdesc);
2552 		return (RDMA_FAILED);
2553 	}
2554 	mutex_exit(&conn->c_lock);
2555 
2556 	/*
2557 	 * Wait for send to complete
2558 	 */
2559 	if (wait) {
2560 		ret = rib_sendwait(qp, wdesc);
2561 		if (ret != 0) {
2562 			return (ret);
2563 		}
2564 	}
2565 	return (RDMA_SUCCESS);
2566 }
2567 
2568 /*
2569  * RDMA Read a buffer from the remote address.
2570  */
2571 rdma_stat
2572 rib_read(CONN *conn, struct clist *cl, int wait)
2573 {
2574 	ibt_send_wr_t	rx_wr;
2575 	int		nds;
2576 	int		cv_sig;
2577 	ibt_wr_ds_t	sgl[DSEG_MAX];	/* is 2 sufficient? */
2578 	struct send_wid	*wdesc;
2579 	ibt_status_t	ibt_status = IBT_SUCCESS;
2580 	rdma_stat	ret = RDMA_SUCCESS;
2581 	rib_qp_t	*qp = ctoqp(conn);
2582 
2583 	if (cl == NULL) {
2584 		cmn_err(CE_WARN, "rib_read: NULL clist\n");
2585 		return (RDMA_FAILED);
2586 	}
2587 
2588 	bzero(&rx_wr, sizeof (ibt_send_wr_t));
2589 	/*
2590 	 * Remote address is at the head chunk item in list.
2591 	 */
2592 	rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_saddr;
2593 	rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; /* rkey */
2594 
2595 	nds = 0;
2596 	while (cl != NULL) {
2597 		if (nds >= DSEG_MAX) {
2598 			cmn_err(CE_WARN, "rib_read: DSEG_MAX too small!");
2599 			return (RDMA_FAILED);
2600 		}
2601 		sgl[nds].ds_va = cl->c_daddr;
2602 		sgl[nds].ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2603 		sgl[nds].ds_len = cl->c_len;
2604 		cl = cl->c_next;
2605 		nds++;
2606 	}
2607 
2608 	if (wait) {
2609 		rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2610 		cv_sig = 1;
2611 	} else {
2612 		rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2613 		cv_sig = 0;
2614 	}
2615 
2616 	wdesc = rib_init_sendwait(0, cv_sig, qp);
2617 	rx_wr.wr_id = (ibt_wrid_t)wdesc;
2618 	rx_wr.wr_opcode = IBT_WRC_RDMAR;
2619 	rx_wr.wr_trans = IBT_RC_SRV;
2620 	rx_wr.wr_nds = nds;
2621 	rx_wr.wr_sgl = sgl;
2622 
2623 	mutex_enter(&conn->c_lock);
2624 	if (conn->c_state & C_CONNECTED) {
2625 		ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2626 	}
2627 	if (((conn->c_state & C_CONNECTED) == 0) ||
2628 		ibt_status != IBT_SUCCESS) {
2629 		mutex_exit(&conn->c_lock);
2630 #ifdef DEBUG
2631 		if (rib_debug && ibt_status != IBT_SUCCESS)
2632 			cmn_err(CE_WARN, "rib_read: FAILED post_sending RDMAR"
2633 				" wr_id %llx on qp %p, status=%d",
2634 				(longlong_t)rx_wr.wr_id, (void *)qp,
2635 				ibt_status);
2636 #endif
2637 		(void) rib_free_sendwait(wdesc);
2638 		return (RDMA_FAILED);
2639 	}
2640 	mutex_exit(&conn->c_lock);
2641 
2642 	/*
2643 	 * Wait for send to complete
2644 	 */
2645 	if (wait) {
2646 		ret = rib_sendwait(qp, wdesc);
2647 		if (ret != 0) {
2648 			return (ret);
2649 		}
2650 	}
2651 
2652 	return (RDMA_SUCCESS);
2653 }
2654 
2655 int
2656 is_for_ipv4(ibt_ar_t *result)
2657 {
2658 	int	i, size = sizeof (struct in_addr);
2659 	uint8_t	zero = 0;
2660 
2661 	for (i = 0; i < (ATS_AR_DATA_LEN - size); i++)
2662 		zero |= result->ar_data[i];
2663 	return (zero == 0);
2664 }
2665 
/*
 * rib_srv_cm_handler()
 *    Connection Manager callback to handle RC connection requests.
 *
 *    any	the rpcib_state_t the handler was registered with
 *    event	the CM event being delivered
 *    ret_args	filled in with the reply parameters for a REQ that is
 *		being processed (channel, RDMA read depths, RNR retries)
 *
 *    IBT_CM_EVENT_REQ_RCV: create a server channel, pre-post
 *    preposted_rbufs receive buffers, add the connection to the HCA's
 *    server connection list and try to resolve the client's IP address
 *    through ATS.  Failures up to and including the HCA state check
 *    return IBT_CM_REJECT.
 *
 *    IBT_CM_EVENT_CONN_CLOSED: when the close was not initiated locally,
 *    free the channel, mark the connection C_ERROR and disconnect it if
 *    it is no longer referenced.
 *
 *    IBT_CM_EVENT_CONN_EST: nothing to do besides optional logging.
 *
 *    Any other event type returns IBT_CM_REJECT.
 */
/* ARGSUSED */
static ibt_cm_status_t
rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
	ibt_cm_return_args_t *ret_args, void *priv_data,
	ibt_priv_data_len_t len)
{
	queue_t		*q;
	rib_qp_t	*qp;
	rpcib_state_t	*ribstat;
	rib_hca_t	*hca;
	rdma_stat	status = RDMA_SUCCESS;
	int		i;
	struct clist	cl;
	rdma_buf_t	rdbuf;
	void		*buf = NULL;
	ibt_cm_req_rcv_t	cm_req_rcv;
	CONN		*conn;
	ibt_status_t ibt_status;
	ibt_ar_t	ar_query, ar_result;
	ib_gid_t	sgid;


	ASSERT(any != NULL);
	ASSERT(event != NULL);

	ribstat = (rpcib_state_t *)any;
	hca = (rib_hca_t *)ribstat->hca;
	ASSERT(hca != NULL);

	/* got a connection request */
	switch (event->cm_type) {
	case IBT_CM_EVENT_REQ_RCV:
		/*
		 * If the plugin is in the NO_ACCEPT state, bail out.
		 */
		mutex_enter(&plugin_state_lock);
		if (plugin_state == NO_ACCEPT) {
			mutex_exit(&plugin_state_lock);
			return (IBT_CM_REJECT);
		}
		mutex_exit(&plugin_state_lock);

		/*
		 * Need to send a MRA MAD to CM so that it does not
		 * timeout on us.
		 */
		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
			    event->cm_event.req.req_timeout * 8, NULL, 0);

		/* Snapshot the queue pointer under open_hca_lock */
		mutex_enter(&rib_stat->open_hca_lock);
		q = rib_stat->q;
		mutex_exit(&rib_stat->open_hca_lock);
		status = rib_svc_create_chan(hca, (caddr_t)q,
			event->cm_event.req.req_prim_hca_port, &qp);
		if (status) {
#ifdef DEBUG
			cmn_err(CE_WARN, "rib_srv_cm_handler: "
			    "create_channel failed %d", status);
#endif
			return (IBT_CM_REJECT);
		}
		/* Local copy of the REQ details, used below */
		cm_req_rcv = event->cm_event.req;

#ifdef DEBUG
		if (rib_debug > 2) {
		    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			"server recv'ed IBT_CM_EVENT_REQ_RCV\n");
		    cmn_err(CE_NOTE, "\t\t SID:%llx\n",
				(longlong_t)cm_req_rcv.req_service_id);
		    cmn_err(CE_NOTE, "\t\t Local Port:%d\n",
				cm_req_rcv.req_prim_hca_port);
		    cmn_err(CE_NOTE,
			"\t\t Remote GID:(prefix:%llx,guid:%llx)\n",
			(longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_prefix,
			(longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_guid);
		    cmn_err(CE_NOTE, "\t\t Local GID:(prefix:%llx,guid:%llx)\n",
			(longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_prefix,
			(longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_guid);
		    cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n",
			cm_req_rcv.req_remote_qpn);
		    cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n",
			cm_req_rcv.req_remote_qkey);
		    cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n",
			(void *)qp, (void *)qp->qp_hdl);
		}

		if (rib_debug > 2) {
		    ibt_rc_chan_query_attr_t	chan_attrs;

		    if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs)
			== IBT_SUCCESS) {
			cmn_err(CE_NOTE, "rib_svc_cm_handler: qp %p in "
			    "CEP state %d\n", (void *)qp, chan_attrs.rc_state);
		    }
		}
#endif

		/* Reply parameters handed back to the CM */
		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
		ret_args->cm_ret.rep.cm_rdma_ra_out = 1;
		ret_args->cm_ret.rep.cm_rdma_ra_in = 1;
		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;

		/*
		 * Pre-posts RECV buffers
		 */
		conn = qptoc(qp);
		for (i = 0; i < preposted_rbufs; i++) {
		    bzero(&rdbuf, sizeof (rdbuf));
		    rdbuf.type = RECV_BUFFER;
		    buf = rib_rbuf_alloc(conn, &rdbuf);
		    if (buf == NULL) {
			cmn_err(CE_WARN, "rib_svc_cm_handler: "
			    "No RECV_BUFFER buf!\n");
			(void) rib_disconnect_channel(conn, NULL);
			return (IBT_CM_REJECT);
		    }

		    /* Describe the buffer as a one-element chunk list */
		    bzero(&cl, sizeof (cl));
		    cl.c_saddr = (uint64)rdbuf.addr;
		    cl.c_len = rdbuf.len;
		    cl.c_smemhandle.mrc_lmr = rdbuf.handle.mrc_lmr; /* lkey */
		    cl.c_next = NULL;
		    status = rib_post_recv(conn, &cl);
		    if (status != RDMA_SUCCESS) {
			cmn_err(CE_WARN, "rib_srv_cm_handler: failed "
			    "posting RPC_REQ buf to qp %p!", (void *)qp);
			(void) rib_disconnect_channel(conn, NULL);
			return (IBT_CM_REJECT);
		    }
		}
		(void) rib_add_connlist(conn, &hca->srv_conn_list);

		/*
		 * Get the address translation service record from ATS
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
		    rw_exit(&hca->state_lock);
		    return (IBT_CM_REJECT);
		}
		rw_exit(&hca->state_lock);

		/*
		 * Try each HCA port in turn (port numbers start at 1);
		 * the first port whose address record query succeeds
		 * supplies the client's address and ends the search.
		 */
		for (i = 0; i < hca->hca_nports; i++) {
		    ibt_status = ibt_get_port_state(hca->hca_hdl, i+1,
					&sgid, NULL);
		    if (ibt_status != IBT_SUCCESS) {
			if (rib_debug) {
			    cmn_err(CE_WARN, "rib_srv_cm_handler: "
				"ibt_get_port_state FAILED!"
				"status = %d\n", ibt_status);
			}
		    } else {
			/*
			 * do ibt_query_ar()
			 */
			bzero(&ar_query, sizeof (ar_query));
			bzero(&ar_result, sizeof (ar_result));
			ar_query.ar_gid = cm_req_rcv.req_prim_addr.av_dgid;
			ar_query.ar_pkey = event->cm_event.req.req_pkey;
			ibt_status = ibt_query_ar(&sgid, &ar_query,
							&ar_result);
			if (ibt_status != IBT_SUCCESS) {
			    if (rib_debug) {
				cmn_err(CE_WARN, "rib_srv_cm_handler: "
				    "ibt_query_ar FAILED!"
				    "status = %d\n", ibt_status);
			    }
			} else {
			    conn = qptoc(qp);

			    /*
			     * Record the client's address in
			     * conn->c_raddr as a sockaddr_in or
			     * sockaddr_in6.
			     */
			    if (is_for_ipv4(&ar_result)) {
				struct sockaddr_in *s;
				int sin_size = sizeof (struct sockaddr_in);
				int in_size = sizeof (struct in_addr);
				uint8_t	*start_pos;

				conn->c_raddr.maxlen =
					conn->c_raddr.len = sin_size;
				conn->c_raddr.buf = kmem_zalloc(sin_size,
						KM_SLEEP);
				s = (struct sockaddr_in *)conn->c_raddr.buf;
				s->sin_family = AF_INET;
				/*
				 * For IPv4,  the IP addr is stored in
				 * the last four bytes of ar_data.
				 */
				start_pos = ar_result.ar_data +
					ATS_AR_DATA_LEN - in_size;
				bcopy(start_pos, &s->sin_addr, in_size);
				if (rib_debug > 1) {
				    char print_addr[INET_ADDRSTRLEN];

				    bzero(print_addr, INET_ADDRSTRLEN);
				    (void) inet_ntop(AF_INET, &s->sin_addr,
						print_addr, INET_ADDRSTRLEN);
				    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
					"remote clnt_addr: %s\n", print_addr);
				}
			    } else {
				struct sockaddr_in6 *s6;
				int sin6_size = sizeof (struct sockaddr_in6);

				conn->c_raddr.maxlen =
					conn->c_raddr.len = sin6_size;
				conn->c_raddr.buf = kmem_zalloc(sin6_size,
					KM_SLEEP);

				s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
				s6->sin6_family = AF_INET6;
				/* sin6_addr is stored in ar_data */
				bcopy(ar_result.ar_data, &s6->sin6_addr,
					sizeof (struct in6_addr));
				if (rib_debug > 1) {
				    char print_addr[INET6_ADDRSTRLEN];

				    bzero(print_addr, INET6_ADDRSTRLEN);
				    (void) inet_ntop(AF_INET6, &s6->sin6_addr,
						print_addr, INET6_ADDRSTRLEN);
				    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
					"remote clnt_addr: %s\n", print_addr);
				}
			    }
			    return (IBT_CM_ACCEPT);
			}
		    }
		}
		/*
		 * No port produced an address record.  Control falls out
		 * of the switch to the final IBT_CM_ACCEPT below, with
		 * conn->c_raddr not filled in by this routine.
		 */
		if (rib_debug > 1) {
		    cmn_err(CE_WARN, "rib_srv_cm_handler: "
				"address record query failed!");
		}
		break;

	case IBT_CM_EVENT_CONN_CLOSED:
	{
		CONN		*conn;
		rib_qp_t	*qp;

		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			/* A disconnect is already pending: leave it alone */
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}
			conn->c_state = C_ERROR;

			/*
			 * Free the rc_channel. Channel has already
			 * transitioned to ERROR state and WRs have been
			 * FLUSHED_ERR already.
			 */
			(void) ibt_free_channel(qp->qp_hdl);
			qp->qp_hdl = NULL;

			/*
			 * Free the conn if c_ref goes down to 0
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				(void) rib_disconnect_channel(conn,
					&hca->srv_conn_list);
			} else {
				mutex_exit(&conn->c_lock);
			}
#ifdef DEBUG
			if (rib_debug)
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
					" (CONN_CLOSED) channel disconnected");
#endif
			break;
		}
		break;
	}
	case IBT_CM_EVENT_CONN_EST:
	/*
	 * RTU received, hence connection established.
	 */
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				"(CONN_EST) channel established");
		break;

	default:
	    /* Any other event: optionally log its type, then reject */
	    if (rib_debug > 2) {
		/* Let CM handle the following events. */
		if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
		} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
		} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
		} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
		} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "server recv'ed IBT_CM_EVENT_FAILURE\n");
		}
	    }
	    return (IBT_CM_REJECT);
	}

	/* accept all other CM messages (i.e. let the CM handle them) */
	return (IBT_CM_ACCEPT);
}
3002 
3003 static rdma_stat
3004 rib_register_ats(rib_hca_t *hca)
3005 {
3006 	ibt_hca_portinfo_t	*port_infop;
3007 	uint_t			port_size;
3008 	uint_t			pki, i, num_ports, nbinds;
3009 	ibt_status_t		ibt_status;
3010 	rib_service_t		*new_service, *temp_srv;
3011 	rpcib_ats_t		*atsp;
3012 	rpcib_ibd_insts_t	ibds;
3013 	ib_pkey_t		pkey;
3014 	ibt_ar_t		ar;	/* address record */
3015 
3016 	/*
3017 	 * Query all ports for the given HCA
3018 	 */
3019 	rw_enter(&hca->state_lock, RW_READER);
3020 	if (hca->state != HCA_DETACHED) {
3021 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3022 		    &num_ports, &port_size);
3023 		rw_exit(&hca->state_lock);
3024 	} else {
3025 		rw_exit(&hca->state_lock);
3026 		return (RDMA_FAILED);
3027 	}
3028 	if (ibt_status != IBT_SUCCESS) {
3029 #ifdef DEBUG
3030 	    if (rib_debug) {
3031 		cmn_err(CE_NOTE, "rib_register_ats: FAILED in "
3032 		    "ibt_query_hca_ports, status = %d\n", ibt_status);
3033 	    }
3034 #endif
3035 		return (RDMA_FAILED);
3036 	}
3037 
3038 #ifdef	DEBUG
3039 	if (rib_debug > 1) {
3040 		cmn_err(CE_NOTE, "rib_register_ats: Ports detected "
3041 		    "%d\n", num_ports);
3042 
3043 		for (i = 0; i < num_ports; i++) {
3044 			if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3045 				cmn_err(CE_WARN, "rib_register_ats "
3046 				    "Port #: %d INACTIVE\n", i+1);
3047 			} else if (port_infop[i].p_linkstate ==
3048 			    IBT_PORT_ACTIVE) {
3049 				cmn_err(CE_NOTE, "rib_register_ats "
3050 				    "Port #: %d ACTIVE\n", i+1);
3051 			}
3052 		}
3053 	}
3054 #endif
3055 
3056 	ibds.rib_ibd_alloc = N_IBD_INSTANCES;
3057 	ibds.rib_ibd_cnt = 0;
3058 	ibds.rib_ats = (rpcib_ats_t *)kmem_zalloc(ibds.rib_ibd_alloc *
3059 			sizeof (rpcib_ats_t), KM_SLEEP);
3060 	rib_get_ibd_insts(&ibds);
3061 
3062 	if (ibds.rib_ibd_cnt == 0) {
3063 	    kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
3064 				sizeof (rpcib_ats_t));
3065 	    ibt_free_portinfo(port_infop, port_size);
3066 	    return (RDMA_FAILED);
3067 	}
3068 
3069 	/*
3070 	 * Get the IP addresses of active ports and
3071 	 * register them with ATS.  IPv4 addresses
3072 	 * have precedence over IPv6 addresses.
3073 	 */
3074 	if (get_ibd_ipaddr(&ibds) != 0) {
3075 #ifdef	DEBUG
3076 	    if (rib_debug > 1) {
3077 		cmn_err(CE_WARN, "rib_register_ats: "
3078 		    "get_ibd_ipaddr failed");
3079 	    }
3080 #endif
3081 	    kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
3082 				sizeof (rpcib_ats_t));
3083 	    ibt_free_portinfo(port_infop, port_size);
3084 	    return (RDMA_FAILED);
3085 	}
3086 
3087 	/*
3088 	 * Start ATS registration for active ports on this HCA.
3089 	 */
3090 	rw_enter(&hca->service_list_lock, RW_WRITER);
3091 	nbinds = 0;
3092 	new_service = NULL;
3093 	for (i = 0; i < num_ports; i++) {
3094 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3095 			continue;
3096 
3097 	    for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3098 		pkey = port_infop[i].p_pkey_tbl[pki];
3099 		if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
3100 		    ar.ar_gid = port_infop[i].p_sgid_tbl[0];
3101 		    ar.ar_pkey = pkey;
3102 		    atsp = get_ibd_entry(&ar.ar_gid, pkey, &ibds);
3103 		    if (atsp == NULL)
3104 			continue;
3105 		/*
3106 		 * store the sin[6]_addr in ar_data
3107 		 */
3108 		    (void) bzero(ar.ar_data, ATS_AR_DATA_LEN);
3109 		    if (atsp->ras_inet_type == AF_INET) {
3110 			uint8_t *start_pos;
3111 
3112 			/*
3113 			 * The ipv4 addr goes into the last
3114 			 * four bytes of ar_data.
3115 			 */
3116 			start_pos = ar.ar_data + ATS_AR_DATA_LEN -
3117 				sizeof (struct in_addr);
3118 			bcopy(&atsp->ras_sin.sin_addr, start_pos,
3119 				sizeof (struct in_addr));
3120 		    } else if (atsp->ras_inet_type == AF_INET6) {
3121 			bcopy(&atsp->ras_sin6.sin6_addr, ar.ar_data,
3122 				sizeof (struct in6_addr));
3123 		    } else
3124 			continue;
3125 
3126 		    ibt_status = ibt_register_ar(hca->ibt_clnt_hdl, &ar);
3127 		    if (ibt_status == IBT_SUCCESS) {
3128 #ifdef	DEBUG
3129 			if (rib_debug > 1) {
3130 				cmn_err(CE_WARN, "rib_register_ats: "
3131 				    "ibt_register_ar OK on port %d", i+1);
3132 			}
3133 #endif
3134 			/*
3135 			 * Allocate and prepare a service entry
3136 			 */
3137 			new_service = kmem_zalloc(sizeof (rib_service_t),
3138 				KM_SLEEP);
3139 			new_service->srv_port = i + 1;
3140 			new_service->srv_ar = ar;
3141 			new_service->srv_next = NULL;
3142 
3143 			/*
3144 			 * Add to the service list for this HCA
3145 			 */
3146 			new_service->srv_next = hca->ats_list;
3147 			hca->ats_list = new_service;
3148 			new_service = NULL;
3149 			nbinds ++;
3150 		    } else {
3151 #ifdef	DEBUG
3152 			if (rib_debug > 1) {
3153 			    cmn_err(CE_WARN, "rib_register_ats: "
3154 			    "ibt_register_ar FAILED on port %d", i+1);
3155 			}
3156 #endif
3157 		    }
3158 		}
3159 	    }
3160 	}
3161 
3162 #ifdef	DEBUG
3163 	if (rib_debug > 1) {
3164 		for (temp_srv = hca->ats_list; temp_srv != NULL;
3165 			temp_srv = temp_srv->srv_next) {
3166 				cmn_err(CE_NOTE, "Service: ATS, active on"
3167 					" port: %d\n", temp_srv->srv_port);
3168 		}
3169 	}
3170 #endif
3171 
3172 	rw_exit(&hca->service_list_lock);
3173 	kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * sizeof (rpcib_ats_t));
3174 	ibt_free_portinfo(port_infop, port_size);
3175 
3176 	if (nbinds == 0) {
3177 #ifdef	DEBUG
3178 	if (rib_debug > 1) {
3179 		cmn_err(CE_WARN, "rib_register_ats FAILED!\n");
3180 	}
3181 #endif
3182 		return (RDMA_FAILED);
3183 	}
3184 	return (RDMA_SUCCESS);
3185 }
3186 
3187 static rdma_stat
3188 rib_register_service(rib_hca_t *hca, int service_type)
3189 {
3190 	ibt_srv_desc_t		sdesc;
3191 	ibt_srv_bind_t		sbind;
3192 	ibt_hca_portinfo_t	*port_infop;
3193 	ib_svc_id_t		srv_id;
3194 	ibt_srv_hdl_t		srv_hdl;
3195 	uint_t			port_size;
3196 	uint_t			pki, i, j, num_ports, nbinds;
3197 	ibt_status_t		ibt_status;
3198 	char			**addrs;
3199 	int			addr_count;
3200 	rib_service_t		*new_service, *temp_srv;
3201 	ib_pkey_t		pkey;
3202 
3203 	/*
3204 	 * Query all ports for the given HCA
3205 	 */
3206 	rw_enter(&hca->state_lock, RW_READER);
3207 	if (hca->state != HCA_DETACHED) {
3208 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3209 		    &num_ports, &port_size);
3210 		rw_exit(&hca->state_lock);
3211 	} else {
3212 		rw_exit(&hca->state_lock);
3213 		return (RDMA_FAILED);
3214 	}
3215 	if (ibt_status != IBT_SUCCESS) {
3216 #ifdef DEBUG
3217 		cmn_err(CE_NOTE, "rib_register_service: FAILED in "
3218 		    "ibt_query_hca_ports, status = %d\n", ibt_status);
3219 #endif
3220 		return (RDMA_FAILED);
3221 	}
3222 
3223 #ifdef	DEBUG
3224 	if (rib_debug > 1) {
3225 		cmn_err(CE_NOTE, "rib_register_service: Ports detected "
3226 		    "%d\n", num_ports);
3227 
3228 		for (i = 0; i < num_ports; i++) {
3229 			if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3230 				cmn_err(CE_WARN, "rib_register_service "
3231 				    "Port #: %d INACTIVE\n", i+1);
3232 			} else if (port_infop[i].p_linkstate ==
3233 			    IBT_PORT_ACTIVE) {
3234 				cmn_err(CE_NOTE, "rib_register_service "
3235 				    "Port #: %d ACTIVE\n", i+1);
3236 			}
3237 		}
3238 	}
3239 #endif
3240 	/*
3241 	 * Get all the IP addresses on this system to register the
3242 	 * given "service type" on all DNS recognized IP addrs.
3243 	 * Each service type such as NFS will have all the systems
3244 	 * IP addresses as its different names. For now the only
3245 	 * type of service we support in RPCIB is NFS.
3246 	 */
3247 	addrs = get_ip_addrs(&addr_count);
3248 	if (addrs == NULL) {
3249 #ifdef DEBUG
3250 		if (rib_debug) {
3251 		    cmn_err(CE_WARN, "rib_register_service: "
3252 			"get_ip_addrs failed\n");
3253 		}
3254 #endif
3255 		ibt_free_portinfo(port_infop, port_size);
3256 		return (RDMA_FAILED);
3257 	}
3258 
3259 #ifdef	DEBUG
3260 	if (rib_debug > 1) {
3261 		for (i = 0; i < addr_count; i++)
3262 			cmn_err(CE_NOTE, "addr %d: %s\n", i, addrs[i]);
3263 	}
3264 #endif
3265 
3266 	rw_enter(&hca->service_list_lock, RW_WRITER);
3267 	/*
3268 	 * Start registering and binding service to active
3269 	 * on active ports on this HCA.
3270 	 */
3271 	nbinds = 0;
3272 	new_service = NULL;
3273 
3274 	/*
3275 	 * We use IP addresses as the service names for
3276 	 * service registration.  Register each of them
3277 	 * with CM to obtain a svc_id and svc_hdl.  We do not
3278 	 * register the service with machine's loopback address.
3279 	 */
3280 	for (j = 1; j < addr_count; j++) {
3281 	    (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3282 	    (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3283 	    (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3284 
3285 	    sdesc.sd_handler = rib_srv_cm_handler;
3286 	    sdesc.sd_flags = 0;
3287 
3288 	    ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3289 			    &sdesc, 0, 1, &srv_hdl, &srv_id);
3290 	    if (ibt_status != IBT_SUCCESS) {
3291 #ifdef DEBUG
3292 		if (rib_debug) {
3293 		    cmn_err(CE_WARN, "rib_register_service: "
3294 			"ibt_register_service FAILED, status "
3295 			"= %d\n", ibt_status);
3296 		}
3297 #endif
3298 		/*
3299 		 * No need to go on, since we failed to obtain
3300 		 * a srv_id and srv_hdl. Move on to the next
3301 		 * IP addr as a service name.
3302 		 */
3303 		continue;
3304 	    }
3305 	    for (i = 0; i < num_ports; i++) {
3306 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3307 			continue;
3308 
3309 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3310 		    pkey = port_infop[i].p_pkey_tbl[pki];
3311 		    if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
3312 
3313 			/*
3314 			 * Allocate and prepare a service entry
3315 			 */
3316 			new_service = kmem_zalloc(1 * sizeof (rib_service_t),
3317 			    KM_SLEEP);
3318 			new_service->srv_type = service_type;
3319 			new_service->srv_port = i + 1;
3320 			new_service->srv_id = srv_id;
3321 			new_service->srv_hdl = srv_hdl;
3322 			new_service->srv_sbind_hdl = kmem_zalloc(1 *
3323 			    sizeof (ibt_sbind_hdl_t), KM_SLEEP);
3324 
3325 			new_service->srv_name = kmem_zalloc(IB_SVC_NAME_LEN,
3326 			    KM_SLEEP);
3327 			(void) bcopy(addrs[j], new_service->srv_name,
3328 			    IB_SVC_NAME_LEN);
3329 			(void) strlcat(new_service->srv_name, "::NFS",
3330 				IB_SVC_NAME_LEN);
3331 			new_service->srv_next = NULL;
3332 
3333 			/*
3334 			 * Bind the service, specified by the IP address,
3335 			 * to the port/pkey using the srv_hdl returned
3336 			 * from ibt_register_service().
3337 			 */
3338 			(void) bzero(&sbind, sizeof (ibt_srv_bind_t));
3339 			sbind.sb_pkey = pkey;
3340 			sbind.sb_lease = 0xFFFFFFFF;
3341 			sbind.sb_key[0] = NFS_SEC_KEY0;
3342 			sbind.sb_key[1] = NFS_SEC_KEY1;
3343 			sbind.sb_name = new_service->srv_name;
3344 
3345 #ifdef	DEBUG
3346 			if (rib_debug > 1) {
3347 				cmn_err(CE_NOTE, "rib_register_service: "
3348 				    "binding service using name: %s\n",
3349 				    sbind.sb_name);
3350 			}
3351 #endif
3352 			ibt_status = ibt_bind_service(srv_hdl,
3353 			    port_infop[i].p_sgid_tbl[0], &sbind, rib_stat,
3354 			    new_service->srv_sbind_hdl);
3355 			if (ibt_status != IBT_SUCCESS) {
3356 #ifdef	DEBUG
3357 			    if (rib_debug) {
3358 				cmn_err(CE_WARN, "rib_register_service: FAILED"
3359 				    " in ibt_bind_service, status = %d\n",
3360 				    ibt_status);
3361 			    }
3362 #endif
3363 				kmem_free(new_service->srv_sbind_hdl,
3364 				    sizeof (ibt_sbind_hdl_t));
3365 				kmem_free(new_service->srv_name,
3366 				    IB_SVC_NAME_LEN);
3367 				kmem_free(new_service,
3368 				    sizeof (rib_service_t));
3369 				new_service = NULL;
3370 				continue;
3371 			}
3372 #ifdef	DEBUG
3373 			if (rib_debug > 1) {
3374 				if (ibt_status == IBT_SUCCESS)
3375 					cmn_err(CE_NOTE, "rib_regstr_service: "
3376 					    "Serv: %s REGISTERED on port: %d",
3377 					    sbind.sb_name, i+1);
3378 			}
3379 #endif
3380 			/*
3381 			 * Add to the service list for this HCA
3382 			 */
3383 			new_service->srv_next = hca->service_list;
3384 			hca->service_list = new_service;
3385 			new_service = NULL;
3386 			nbinds ++;
3387 		    }
3388 		}
3389 	    }
3390 	}
3391 	rw_exit(&hca->service_list_lock);
3392 
3393 #ifdef	DEBUG
3394 	if (rib_debug > 1) {
3395 		/*
3396 		 * Change this print to a more generic one, as rpcib
3397 		 * is supposed to handle multiple service types.
3398 		 */
3399 		for (temp_srv = hca->service_list; temp_srv != NULL;
3400 			temp_srv = temp_srv->srv_next) {
3401 				cmn_err(CE_NOTE, "NFS-IB, active on port:"
3402 					" %d\n"
3403 					"Using name: %s", temp_srv->srv_port,
3404 					temp_srv->srv_name);
3405 		}
3406 	}
3407 #endif
3408 
3409 	ibt_free_portinfo(port_infop, port_size);
3410 	for (i = 0; i < addr_count; i++) {
3411 		if (addrs[i])
3412 			kmem_free(addrs[i], IB_SVC_NAME_LEN);
3413 	}
3414 	kmem_free(addrs, addr_count * sizeof (char *));
3415 
3416 	if (nbinds == 0) {
3417 #ifdef	DEBUG
3418 	    if (rib_debug) {
3419 		cmn_err(CE_WARN, "rib_register_service: "
3420 		    "bind_service FAILED!\n");
3421 	    }
3422 #endif
3423 		return (RDMA_FAILED);
3424 	} else {
3425 		/*
3426 		 * Put this plugin into accept state, since atleast
3427 		 * one registration was successful.
3428 		 */
3429 		mutex_enter(&plugin_state_lock);
3430 		plugin_state = ACCEPT;
3431 		mutex_exit(&plugin_state_lock);
3432 		return (RDMA_SUCCESS);
3433 	}
3434 }
3435 
3436 void
3437 rib_listen(struct rdma_svc_data *rd)
3438 {
3439 	rdma_stat status = RDMA_SUCCESS;
3440 
3441 	rd->active = 0;
3442 	rd->err_code = RDMA_FAILED;
3443 
3444 	/*
3445 	 * First check if a hca is still attached
3446 	 */
3447 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3448 	if (rib_stat->hca->state != HCA_INITED) {
3449 		rw_exit(&rib_stat->hca->state_lock);
3450 		return;
3451 	}
3452 	rw_exit(&rib_stat->hca->state_lock);
3453 
3454 	rib_stat->q = &rd->q;
3455 	/*
3456 	 * Register the Address translation service
3457 	 */
3458 	mutex_enter(&rib_stat->open_hca_lock);
3459 	if (ats_running == 0) {
3460 		if (rib_register_ats(rib_stat->hca) != RDMA_SUCCESS) {
3461 #ifdef	DEBUG
3462 		    if (rib_debug) {
3463 			cmn_err(CE_WARN,
3464 			    "rib_listen(): ats registration failed!");
3465 		    }
3466 #endif
3467 		    mutex_exit(&rib_stat->open_hca_lock);
3468 		    return;
3469 		} else {
3470 			ats_running = 1;
3471 		}
3472 	}
3473 	mutex_exit(&rib_stat->open_hca_lock);
3474 
3475 	/*
3476 	 * Right now the only service type is NFS. Hence force feed this
3477 	 * value. Ideally to communicate the service type it should be
3478 	 * passed down in rdma_svc_data.
3479 	 */
3480 	rib_stat->service_type = NFS;
3481 	status = rib_register_service(rib_stat->hca, NFS);
3482 	if (status != RDMA_SUCCESS) {
3483 		rd->err_code = status;
3484 		return;
3485 	}
3486 	/*
3487 	 * Service active on an HCA, check rd->err_code for more
3488 	 * explainable errors.
3489 	 */
3490 	rd->active = 1;
3491 	rd->err_code = status;
3492 }
3493 
3494 /* XXXX */
3495 /* ARGSUSED */
3496 static void
3497 rib_listen_stop(struct rdma_svc_data *svcdata)
3498 {
3499 	rib_hca_t		*hca;
3500 
3501 	/*
3502 	 * KRPC called the RDMATF to stop the listeners, this means
3503 	 * stop sending incomming or recieved requests to KRPC master
3504 	 * transport handle for RDMA-IB. This is also means that the
3505 	 * master transport handle, responsible for us, is going away.
3506 	 */
3507 	mutex_enter(&plugin_state_lock);
3508 	plugin_state = NO_ACCEPT;
3509 	if (svcdata != NULL)
3510 		svcdata->active = 0;
3511 	mutex_exit(&plugin_state_lock);
3512 
3513 	/*
3514 	 * First check if a hca is still attached
3515 	 */
3516 	hca = rib_stat->hca;
3517 	rw_enter(&hca->state_lock, RW_READER);
3518 	if (hca->state != HCA_INITED) {
3519 		rw_exit(&hca->state_lock);
3520 		return;
3521 	}
3522 	rib_stop_services(hca);
3523 	rw_exit(&hca->state_lock);
3524 }
3525 
3526 /*
3527  * Traverse the HCA's service list to unbind and deregister services.
3528  * Instead of unbinding the service for a service handle by
3529  * calling ibt_unbind_service() for each port/pkey, we unbind
3530  * all the services for the service handle by making only one
3531  * call to ibt_unbind_all_services().  Then, we deregister the
3532  * service for the service handle.
3533  *
3534  * When traversing the entries in service_list, we compare the
3535  * srv_hdl of the current entry with that of the next.  If they
3536  * are different or if the next entry is NULL, the current entry
3537  * marks the last binding of the service handle.  In this case,
3538  * call ibt_unbind_all_services() and deregister the service for
3539  * the service handle.  If they are the same, the current and the
3540  * next entries are bound to the same service handle.  In this
3541  * case, move on to the next entry.
3542  */
3543 static void
3544 rib_stop_services(rib_hca_t *hca)
3545 {
3546 	rib_service_t		*srv_list, *to_remove;
3547 	ibt_status_t   		ibt_status;
3548 
3549 	/*
3550 	 * unbind and deregister the services for this service type.
3551 	 * Right now there is only one service type. In future it will
3552 	 * be passed down to this function.
3553 	 */
3554 	rw_enter(&hca->service_list_lock, RW_WRITER);
3555 	srv_list = hca->service_list;
3556 	while (srv_list != NULL) {
3557 		to_remove = srv_list;
3558 		srv_list = to_remove->srv_next;
3559 		if (srv_list == NULL || bcmp(to_remove->srv_hdl,
3560 		    srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
3561 
3562 		    ibt_status = ibt_unbind_all_services(to_remove->srv_hdl);
3563 		    if (ibt_status != IBT_SUCCESS) {
3564 			cmn_err(CE_WARN, "rib_listen_stop: "
3565 			    "ibt_unbind_all_services FAILED"
3566 				" status: %d\n", ibt_status);
3567 		    }
3568 
3569 		    ibt_status =
3570 			ibt_deregister_service(hca->ibt_clnt_hdl,
3571 				to_remove->srv_hdl);
3572 		    if (ibt_status != IBT_SUCCESS) {
3573 			cmn_err(CE_WARN, "rib_listen_stop: "
3574 			    "ibt_deregister_service FAILED"
3575 				" status: %d\n", ibt_status);
3576 		    }
3577 
3578 #ifdef	DEBUG
3579 		    if (rib_debug > 1) {
3580 			if (ibt_status == IBT_SUCCESS)
3581 				cmn_err(CE_NOTE, "rib_listen_stop: "
3582 				    "Successfully stopped and"
3583 				    " UNREGISTERED service: %s\n",
3584 				    to_remove->srv_name);
3585 		    }
3586 #endif
3587 		}
3588 		kmem_free(to_remove->srv_name, IB_SVC_NAME_LEN);
3589 		kmem_free(to_remove->srv_sbind_hdl,
3590 			sizeof (ibt_sbind_hdl_t));
3591 
3592 		kmem_free(to_remove, sizeof (rib_service_t));
3593 	}
3594 	hca->service_list = NULL;
3595 	rw_exit(&hca->service_list_lock);
3596 }
3597 
3598 static struct svc_recv *
3599 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3600 {
3601 	struct svc_recv	*recvp;
3602 
3603 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3604 	recvp->vaddr = sgl->ds_va;
3605 	recvp->qp = qp;
3606 	recvp->bytes_xfer = 0;
3607 	return (recvp);
3608 }
3609 
3610 static int
3611 rib_free_svc_recv(struct svc_recv *recvp)
3612 {
3613 	kmem_free(recvp, sizeof (*recvp));
3614 
3615 	return (0);
3616 }
3617 
3618 static struct reply *
3619 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3620 {
3621 	struct reply	*rep;
3622 
3623 
3624 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3625 	if (rep == NULL) {
3626 		mutex_exit(&qp->replylist_lock);
3627 		cmn_err(CE_WARN, "rib_addreplylist: no memory\n");
3628 		return (NULL);
3629 	}
3630 	rep->xid = msgid;
3631 	rep->vaddr_cq = NULL;
3632 	rep->bytes_xfer = 0;
3633 	rep->status = (uint_t)REPLY_WAIT;
3634 	rep->prev = NULL;
3635 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3636 
3637 	mutex_enter(&qp->replylist_lock);
3638 	if (qp->replylist) {
3639 		rep->next = qp->replylist;
3640 		qp->replylist->prev = rep;
3641 	}
3642 	qp->rep_list_size++;
3643 	if (rib_debug > 1)
3644 	    cmn_err(CE_NOTE, "rib_addreplylist: qp:%p, rep_list_size:%d\n",
3645 		(void *)qp, qp->rep_list_size);
3646 	qp->replylist = rep;
3647 	mutex_exit(&qp->replylist_lock);
3648 
3649 	return (rep);
3650 }
3651 
3652 static rdma_stat
3653 rib_rem_replylist(rib_qp_t *qp)
3654 {
3655 	struct reply	*r, *n;
3656 
3657 	mutex_enter(&qp->replylist_lock);
3658 	for (r = qp->replylist; r != NULL; r = n) {
3659 		n = r->next;
3660 		(void) rib_remreply(qp, r);
3661 	}
3662 	mutex_exit(&qp->replylist_lock);
3663 
3664 	return (RDMA_SUCCESS);
3665 }
3666 
3667 static int
3668 rib_remreply(rib_qp_t *qp, struct reply *rep)
3669 {
3670 
3671 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3672 	if (rep->prev) {
3673 		rep->prev->next = rep->next;
3674 	}
3675 	if (rep->next) {
3676 		rep->next->prev = rep->prev;
3677 	}
3678 	if (qp->replylist == rep)
3679 		qp->replylist = rep->next;
3680 
3681 	cv_destroy(&rep->wait_cv);
3682 	qp->rep_list_size--;
3683 	if (rib_debug > 1)
3684 	    cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n",
3685 		(void *)qp, qp->rep_list_size);
3686 
3687 	kmem_free(rep, sizeof (*rep));
3688 
3689 	return (0);
3690 }
3691 
3692 rdma_stat
3693 rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
3694 	struct mrc *buf_handle)
3695 {
3696 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3697 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3698 	rdma_stat	status;
3699 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3700 
3701 	/*
3702 	 * Note: ALL buffer pools use the same memory type RDMARW.
3703 	 */
3704 	status = rib_reg_mem(hca, buf, buflen, 0, &mr_hdl, &mr_desc);
3705 	if (status == RDMA_SUCCESS) {
3706 		buf_handle->mrc_linfo = (uint64_t)mr_hdl;
3707 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3708 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3709 	} else {
3710 		buf_handle->mrc_linfo = NULL;
3711 		buf_handle->mrc_lmr = 0;
3712 		buf_handle->mrc_rmr = 0;
3713 	}
3714 	return (status);
3715 }
3716 
3717 static rdma_stat
3718 rib_reg_mem(rib_hca_t *hca, caddr_t buf, uint_t size, ibt_mr_flags_t spec,
3719 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3720 {
3721 	ibt_mr_attr_t	mem_attr;
3722 	ibt_status_t	ibt_status;
3723 
3724 	mem_attr.mr_vaddr = (uint64_t)buf;
3725 	mem_attr.mr_len = (ib_msglen_t)size;
3726 	mem_attr.mr_as = NULL;
3727 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3728 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3729 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3730 
3731 	rw_enter(&hca->state_lock, RW_READER);
3732 	if (hca->state == HCA_INITED) {
3733 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3734 					&mem_attr, mr_hdlp, mr_descp);
3735 		rw_exit(&hca->state_lock);
3736 	} else {
3737 		rw_exit(&hca->state_lock);
3738 		return (RDMA_FAILED);
3739 	}
3740 
3741 	if (ibt_status != IBT_SUCCESS) {
3742 		cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr "
3743 			"(spec:%d) failed for addr %llX, status %d",
3744 			spec, (longlong_t)mem_attr.mr_vaddr, ibt_status);
3745 		return (RDMA_FAILED);
3746 	}
3747 	return (RDMA_SUCCESS);
3748 }
3749 
3750 rdma_stat
3751 rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
3752 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)
3753 {
3754 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3755 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3756 	rdma_stat	status;
3757 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3758 
3759 	/*
3760 	 * Non-coherent memory registration.
3761 	 */
3762 	status = rib_reg_mem(hca, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl,
3763 			&mr_desc);
3764 	if (status == RDMA_SUCCESS) {
3765 		buf_handle->mrc_linfo = (uint64_t)mr_hdl;
3766 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3767 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3768 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3769 	} else {
3770 		buf_handle->mrc_linfo = NULL;
3771 		buf_handle->mrc_lmr = 0;
3772 		buf_handle->mrc_rmr = 0;
3773 	}
3774 	return (status);
3775 }
3776 
3777 /* ARGSUSED */
3778 rdma_stat
3779 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3780 {
3781 	rib_hca_t *hca = (ctoqp(conn))->hca;
3782 
3783 	/*
3784 	 * Allow memory deregistration even if HCA is
3785 	 * getting detached. Need all outstanding
3786 	 * memory registrations to be deregistered
3787 	 * before HCA_DETACH_EVENT can be accepted.
3788 	 */
3789 	(void) ibt_deregister_mr(hca->hca_hdl,
3790 			(ibt_mr_hdl_t)buf_handle.mrc_linfo);
3791 	return (RDMA_SUCCESS);
3792 }
3793 
3794 /* ARGSUSED */
3795 rdma_stat
3796 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3797 		RIB_SYNCMEM_HANDLE sync_handle)
3798 {
3799 	(void) rib_deregistermem(conn, buf, buf_handle);
3800 
3801 	return (RDMA_SUCCESS);
3802 }
3803 
3804 /* ARGSUSED */
3805 rdma_stat
3806 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3807 		int len, int cpu)
3808 {
3809 	ibt_status_t	status;
3810 	rib_hca_t *hca = (ctoqp(conn))->hca;
3811 	ibt_mr_sync_t	mr_segment;
3812 
3813 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3814 	mr_segment.ms_vaddr = (ib_vaddr_t)buf;
3815 	mr_segment.ms_len = (ib_memlen_t)len;
3816 	if (cpu) {
3817 		/* make incoming data visible to memory */
3818 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3819 	} else {
3820 		/* make memory changes visible to IO */
3821 		mr_segment.ms_flags = IBT_SYNC_READ;
3822 	}
3823 	rw_enter(&hca->state_lock, RW_READER);
3824 	if (hca->state == HCA_INITED) {
3825 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3826 		rw_exit(&hca->state_lock);
3827 	} else {
3828 		rw_exit(&hca->state_lock);
3829 		return (RDMA_FAILED);
3830 	}
3831 
3832 	if (status == IBT_SUCCESS)
3833 		return (RDMA_SUCCESS);
3834 	else {
3835 #ifdef DEBUG
3836 		cmn_err(CE_WARN, "rib_syncmem: ibt_sync_mr failed with %d\n",
3837 			status);
3838 #endif
3839 		return (RDMA_FAILED);
3840 	}
3841 }
3842 
3843 /*
3844  * XXXX	????
3845  */
3846 static rdma_stat
3847 rib_getinfo(rdma_info_t *info)
3848 {
3849 	/*
3850 	 * XXXX	Hack!
3851 	 */
3852 	info->addrlen = 16;
3853 	info->mts = 1000000;
3854 	info->mtu = 1000000;
3855 
3856 	return (RDMA_SUCCESS);
3857 }
3858 
3859 rib_bufpool_t *
3860 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3861 {
3862 	rib_bufpool_t	*rbp = NULL;
3863 	bufpool_t	*bp = NULL;
3864 	caddr_t		buf;
3865 	ibt_mr_attr_t	mem_attr;
3866 	ibt_status_t	ibt_status;
3867 	int		i, j;
3868 
3869 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3870 
3871 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3872 			num * sizeof (void *), KM_SLEEP);
3873 
3874 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3875 	bp->numelems = num;
3876 
3877 	switch (ptype) {
3878 	    case SEND_BUFFER:
3879 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3880 		/* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
3881 		bp->rsize = RPC_MSG_SZ;
3882 		break;
3883 	    case RECV_BUFFER:
3884 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3885 		/* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
3886 		bp->rsize = RPC_BUF_SIZE;
3887 		break;
3888 	    default:
3889 		goto fail;
3890 	}
3891 
3892 	/*
3893 	 * Register the pool.
3894 	 */
3895 	bp->bufsize = num * bp->rsize;
3896 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3897 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3898 			sizeof (ibt_mr_hdl_t), KM_SLEEP);
3899 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3900 			sizeof (ibt_mr_desc_t), KM_SLEEP);
3901 
3902 	rw_enter(&hca->state_lock, RW_READER);
3903 	if (hca->state != HCA_INITED) {
3904 		rw_exit(&hca->state_lock);
3905 		goto fail;
3906 	}
3907 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3908 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3909 		mem_attr.mr_vaddr = (uint64_t)buf;
3910 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3911 		mem_attr.mr_as = NULL;
3912 		ibt_status = ibt_register_mr(hca->hca_hdl,
3913 			hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i],
3914 			&rbp->mr_desc[i]);
3915 		if (ibt_status != IBT_SUCCESS) {
3916 		    for (j = 0; j < i; j++) {
3917 			(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]);
3918 		    }
3919 		    rw_exit(&hca->state_lock);
3920 		    goto fail;
3921 		}
3922 	}
3923 	rw_exit(&hca->state_lock);
3924 
3925 	buf = (caddr_t)bp->buf;
3926 	for (i = 0; i < num; i++, buf += bp->rsize) {
3927 		bp->buflist[i] = (void *)buf;
3928 	}
3929 	bp->buffree = num - 1;	/* no. of free buffers */
3930 	rbp->bpool = bp;
3931 
3932 	return (rbp);
3933 fail:
3934 	if (bp) {
3935 	    if (bp->buf)
3936 		kmem_free(bp->buf, bp->bufsize);
3937 	    kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3938 	}
3939 	if (rbp) {
3940 	    if (rbp->mr_hdl)
3941 		kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3942 	    if (rbp->mr_desc)
3943 		kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3944 	    kmem_free(rbp, sizeof (rib_bufpool_t));
3945 	}
3946 	return (NULL);
3947 }
3948 
3949 static void
3950 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3951 {
3952 	int i;
3953 	rib_bufpool_t *rbp = NULL;
3954 	bufpool_t *bp;
3955 
3956 	/*
3957 	 * Obtain pool address based on type of pool
3958 	 */
3959 	switch (ptype) {
3960 		case SEND_BUFFER:
3961 			rbp = hca->send_pool;
3962 			break;
3963 		case RECV_BUFFER:
3964 			rbp = hca->recv_pool;
3965 			break;
3966 		default:
3967 			return;
3968 	}
3969 	if (rbp == NULL)
3970 		return;
3971 
3972 	bp = rbp->bpool;
3973 
3974 	/*
3975 	 * Deregister the pool memory and free it.
3976 	 */
3977 	for (i = 0; i < bp->numelems; i++) {
3978 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3979 	}
3980 }
3981 
3982 static void
3983 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3984 {
3985 
3986 	rib_bufpool_t *rbp = NULL;
3987 	bufpool_t *bp;
3988 
3989 	/*
3990 	 * Obtain pool address based on type of pool
3991 	 */
3992 	switch (ptype) {
3993 		case SEND_BUFFER:
3994 			rbp = hca->send_pool;
3995 			break;
3996 		case RECV_BUFFER:
3997 			rbp = hca->recv_pool;
3998 			break;
3999 		default:
4000 			return;
4001 	}
4002 	if (rbp == NULL)
4003 		return;
4004 
4005 	bp = rbp->bpool;
4006 
4007 	/*
4008 	 * Free the pool memory.
4009 	 */
4010 	if (rbp->mr_hdl)
4011 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4012 
4013 	if (rbp->mr_desc)
4014 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4015 
4016 	if (bp->buf)
4017 		kmem_free(bp->buf, bp->bufsize);
4018 	mutex_destroy(&bp->buflock);
4019 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4020 	kmem_free(rbp, sizeof (rib_bufpool_t));
4021 }
4022 
4023 void
4024 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4025 {
4026 	/*
4027 	 * Deregister the pool memory and free it.
4028 	 */
4029 	rib_rbufpool_deregister(hca, ptype);
4030 	rib_rbufpool_free(hca, ptype);
4031 }
4032 
4033 /*
4034  * Fetch a buffer from the pool of type specified in rdbuf->type.
4035  */
4036 static rdma_stat
4037 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4038 {
4039 
4040 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4041 	if (rdbuf->addr) {
4042 		switch (rdbuf->type) {
4043 		case SEND_BUFFER:
4044 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
4045 			break;
4046 		case RECV_BUFFER:
4047 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
4048 			break;
4049 		default:
4050 			rdbuf->len = 0;
4051 		}
4052 		return (RDMA_SUCCESS);
4053 	} else
4054 		return (RDMA_FAILED);
4055 }
4056 
4057 
4058 /*
4059  * Fetch a buffer of specified type.
4060  * Note that rdbuf->handle is mw's rkey.
4061  */
4062 static void *
4063 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4064 {
4065 	rib_qp_t	*qp = ctoqp(conn);
4066 	rib_hca_t	*hca = qp->hca;
4067 	rdma_btype	ptype = rdbuf->type;
4068 	void		*buf;
4069 	rib_bufpool_t	*rbp = NULL;
4070 	bufpool_t	*bp;
4071 	int		i;
4072 
4073 	/*
4074 	 * Obtain pool address based on type of pool
4075 	 */
4076 	switch (ptype) {
4077 		case SEND_BUFFER:
4078 			rbp = hca->send_pool;
4079 			break;
4080 		case RECV_BUFFER:
4081 			rbp = hca->recv_pool;
4082 			break;
4083 		default:
4084 			return (NULL);
4085 	}
4086 	if (rbp == NULL)
4087 		return (NULL);
4088 
4089 	bp = rbp->bpool;
4090 
4091 	mutex_enter(&bp->buflock);
4092 	if (bp->buffree < 0) {
4093 		cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!");
4094 		mutex_exit(&bp->buflock);
4095 		return (NULL);
4096 	}
4097 
4098 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4099 	buf = bp->buflist[bp->buffree];
4100 	rdbuf->addr = buf;
4101 	rdbuf->len = bp->rsize;
4102 	for (i = bp->numelems - 1; i >= 0; i--) {
4103 	    if ((ib_vaddr_t)buf == rbp->mr_desc[i].md_vaddr) {
4104 		rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey;
4105 		rdbuf->handle.mrc_linfo = (uint64_t)rbp->mr_hdl[i];
4106 		rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey;
4107 		bp->buffree--;
4108 		if (rib_debug > 1)
4109 		    cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs "
4110 			"(type %d)\n", bp->buffree+1, ptype);
4111 
4112 		mutex_exit(&bp->buflock);
4113 
4114 		return (buf);
4115 	    }
4116 	}
4117 	cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of "
4118 		"type %d found!", buf, ptype);
4119 	mutex_exit(&bp->buflock);
4120 
4121 	return (NULL);
4122 }
4123 
4124 static void
4125 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4126 {
4127 
4128 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4129 }
4130 
4131 static void
4132 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4133 {
4134 	rib_qp_t *qp = ctoqp(conn);
4135 	rib_hca_t *hca = qp->hca;
4136 	rib_bufpool_t *rbp = NULL;
4137 	bufpool_t *bp;
4138 
4139 	/*
4140 	 * Obtain pool address based on type of pool
4141 	 */
4142 	switch (ptype) {
4143 		case SEND_BUFFER:
4144 			rbp = hca->send_pool;
4145 			break;
4146 		case RECV_BUFFER:
4147 			rbp = hca->recv_pool;
4148 			break;
4149 		default:
4150 			return;
4151 	}
4152 	if (rbp == NULL)
4153 		return;
4154 
4155 	bp = rbp->bpool;
4156 
4157 	mutex_enter(&bp->buflock);
4158 	if (++bp->buffree >= bp->numelems) {
4159 		/*
4160 		 * Should never happen
4161 		 */
4162 		cmn_err(CE_WARN, "rib_rbuf_free: One (type %d) "
4163 			"too many frees!", ptype);
4164 		bp->buffree--;
4165 	} else {
4166 		bp->buflist[bp->buffree] = buf;
4167 		if (rib_debug > 1)
4168 		    cmn_err(CE_NOTE, "rib_rbuf_free: %d free bufs "
4169 			"(type %d)\n", bp->buffree+1, ptype);
4170 	}
4171 	mutex_exit(&bp->buflock);
4172 }
4173 
4174 static rdma_stat
4175 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4176 {
4177 	rw_enter(&connlist->conn_lock, RW_WRITER);
4178 	if (connlist->conn_hd) {
4179 		cn->c_next = connlist->conn_hd;
4180 		connlist->conn_hd->c_prev = cn;
4181 	}
4182 	connlist->conn_hd = cn;
4183 	rw_exit(&connlist->conn_lock);
4184 
4185 	return (RDMA_SUCCESS);
4186 }
4187 
4188 static rdma_stat
4189 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4190 {
4191 	rw_enter(&connlist->conn_lock, RW_WRITER);
4192 	if (cn->c_prev) {
4193 		cn->c_prev->c_next = cn->c_next;
4194 	}
4195 	if (cn->c_next) {
4196 		cn->c_next->c_prev = cn->c_prev;
4197 	}
4198 	if (connlist->conn_hd == cn)
4199 		connlist->conn_hd = cn->c_next;
4200 	rw_exit(&connlist->conn_lock);
4201 
4202 	return (RDMA_SUCCESS);
4203 }
4204 
4205 /*
4206  * Connection management.
4207  * IBTF does not support recycling of channels. So connections are only
4208  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR or
4209  * C_DISCONN_PEND state. No C_IDLE state.
4210  * C_CONN_PEND state: Connection establishment in progress to the server.
4211  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4212  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4213  * only in this state.
4214  * C_ERROR state: A connection transitions to this state when WRs on the
4215  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4216  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4217  * C_DISCONN_PEND state: When a connection is in C_ERROR state and when
4218  * c_ref drops to 0 (this indicates that RPC has no more references to this
4219  * connection), the connection should be destroyed. A connection transitions
4220  * into this state when it is being destroyed.
4221  */
4222 static rdma_stat
4223 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
4224 {
4225 	CONN *cn;
4226 	int status = RDMA_SUCCESS;
4227 	rib_hca_t *hca = (rib_hca_t *)handle;
4228 	rib_qp_t *qp;
4229 	clock_t cv_stat, timout;
4230 	ibt_path_info_t path;
4231 
4232 again:
4233 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4234 	cn = hca->cl_conn_list.conn_hd;
4235 	while (cn != NULL) {
4236 		/*
4237 		 * First, clear up any connection in the ERROR state
4238 		 */
4239 		mutex_enter(&cn->c_lock);
4240 		if (cn->c_state == C_ERROR) {
4241 			if (cn->c_ref == 0) {
4242 				/*
4243 				 * Remove connection from list and destroy it.
4244 				 */
4245 				cn->c_state = C_DISCONN_PEND;
4246 				mutex_exit(&cn->c_lock);
4247 				rw_exit(&hca->cl_conn_list.conn_lock);
4248 				(void) rib_disconnect_channel(cn,
4249 				    &hca->cl_conn_list);
4250 				goto again;
4251 			}
4252 			mutex_exit(&cn->c_lock);
4253 			cn = cn->c_next;
4254 			continue;
4255 		} else if (cn->c_state == C_DISCONN_PEND) {
4256 			mutex_exit(&cn->c_lock);
4257 			cn = cn->c_next;
4258 			continue;
4259 		}
4260 		if ((cn->c_raddr.len == svcaddr->len) &&
4261 		    bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
4262 			/*
4263 			 * Our connection. Give up conn list lock
4264 			 * as we are done traversing the list.
4265 			 */
4266 			rw_exit(&hca->cl_conn_list.conn_lock);
4267 			if (cn->c_state == C_CONNECTED) {
4268 				cn->c_ref++;	/* sharing a conn */
4269 				mutex_exit(&cn->c_lock);
4270 				*conn = cn;
4271 				return (status);
4272 			}
4273 			if (cn->c_state == C_CONN_PEND) {
4274 				/*
4275 				 * Hold a reference to this conn before
4276 				 * we give up the lock.
4277 				 */
4278 				cn->c_ref++;
4279 				timout =  ddi_get_lbolt() +
4280 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
4281 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4282 					&cn->c_lock, timout)) > 0 &&
4283 					cn->c_state == C_CONN_PEND)
4284 					;
4285 				if (cv_stat == 0) {
4286 					cn->c_ref--;
4287 					mutex_exit(&cn->c_lock);
4288 					return (RDMA_INTR);
4289 				}
4290 				if (cv_stat < 0) {
4291 					cn->c_ref--;
4292 					mutex_exit(&cn->c_lock);
4293 					return (RDMA_TIMEDOUT);
4294 				}
4295 				if (cn->c_state == C_CONNECTED) {
4296 					*conn = cn;
4297 					mutex_exit(&cn->c_lock);
4298 					return (status);
4299 				} else {
4300 					cn->c_ref--;
4301 					mutex_exit(&cn->c_lock);
4302 					return (RDMA_TIMEDOUT);
4303 				}
4304 			}
4305 		}
4306 		mutex_exit(&cn->c_lock);
4307 		cn = cn->c_next;
4308 	}
4309 	rw_exit(&hca->cl_conn_list.conn_lock);
4310 
4311 	status = rib_chk_srv_ats(hca, svcaddr, addr_type, &path);
4312 	if (status != RDMA_SUCCESS) {
4313 #ifdef DEBUG
4314 		if (rib_debug) {
4315 			cmn_err(CE_WARN, "rib_conn_get: "
4316 				"No server ATS record!");
4317 		}
4318 #endif
4319 		return (RDMA_FAILED);
4320 	}
4321 
4322 	/*
4323 	 * Channel to server doesn't exist yet, create one.
4324 	 */
4325 	if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
4326 		return (RDMA_FAILED);
4327 	}
4328 	cn = qptoc(qp);
4329 	cn->c_state = C_CONN_PEND;
4330 	cn->c_ref = 1;
4331 
4332 	/*
4333 	 * Add to conn list.
4334 	 * We had given up the READER lock. In the time since then,
4335 	 * another thread might have created the connection we are
4336 	 * trying here. But for now, that is quiet alright - there
4337 	 * might be two connections between a pair of hosts instead
4338 	 * of one. If we really want to close that window,
4339 	 * then need to check the list after acquiring the
4340 	 * WRITER lock.
4341 	 */
4342 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4343 	status = rib_conn_to_srv(hca, qp, &path);
4344 	mutex_enter(&cn->c_lock);
4345 	if (status == RDMA_SUCCESS) {
4346 		cn->c_state = C_CONNECTED;
4347 		*conn = cn;
4348 	} else {
4349 		cn->c_state = C_ERROR;
4350 		cn->c_ref--;
4351 #ifdef DEBUG
4352 		if (rib_debug) {
4353 			cmn_err(CE_WARN, "rib_conn_get: FAILED creating"
4354 			    " a channel!");
4355 		}
4356 #endif
4357 	}
4358 	cv_broadcast(&cn->c_cv);
4359 	mutex_exit(&cn->c_lock);
4360 	return (status);
4361 }
4362 
4363 static rdma_stat
4364 rib_conn_release(CONN *conn)
4365 {
4366 	rib_qp_t	*qp = ctoqp(conn);
4367 
4368 	mutex_enter(&conn->c_lock);
4369 	conn->c_ref--;
4370 
4371 	/*
4372 	 * If a conn is C_ERROR, close the channel.
4373 	 * If it's CONNECTED, keep it that way.
4374 	 */
4375 	if (conn->c_ref == 0 && (conn->c_state &  C_ERROR)) {
4376 		conn->c_state = C_DISCONN_PEND;
4377 		mutex_exit(&conn->c_lock);
4378 		if (qp->mode == RIB_SERVER)
4379 			(void) rib_disconnect_channel(conn,
4380 			    &qp->hca->srv_conn_list);
4381 		else
4382 			(void) rib_disconnect_channel(conn,
4383 			    &qp->hca->cl_conn_list);
4384 		return (RDMA_SUCCESS);
4385 	}
4386 	mutex_exit(&conn->c_lock);
4387 	return (RDMA_SUCCESS);
4388 }
4389 
4390 /*
4391  * Add at front of list
4392  */
4393 static struct rdma_done_list *
4394 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4395 {
4396 	struct rdma_done_list *rd;
4397 
4398 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4399 
4400 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4401 	rd->xid = xid;
4402 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4403 
4404 	rd->prev = NULL;
4405 	rd->next = qp->rdlist;
4406 	if (qp->rdlist != NULL)
4407 		qp->rdlist->prev = rd;
4408 	qp->rdlist = rd;
4409 
4410 	return (rd);
4411 }
4412 
4413 static void
4414 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4415 {
4416 	struct rdma_done_list *r;
4417 
4418 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4419 
4420 	r = rd->next;
4421 	if (r != NULL) {
4422 		r->prev = rd->prev;
4423 	}
4424 
4425 	r = rd->prev;
4426 	if (r != NULL) {
4427 		r->next = rd->next;
4428 	} else {
4429 		qp->rdlist = rd->next;
4430 	}
4431 
4432 	cv_destroy(&rd->rdma_done_cv);
4433 	kmem_free(rd, sizeof (*rd));
4434 }
4435 
4436 static void
4437 rdma_done_rem_list(rib_qp_t *qp)
4438 {
4439 	struct rdma_done_list	*r, *n;
4440 
4441 	mutex_enter(&qp->rdlist_lock);
4442 	for (r = qp->rdlist; r != NULL; r = n) {
4443 		n = r->next;
4444 		rdma_done_rm(qp, r);
4445 	}
4446 	mutex_exit(&qp->rdlist_lock);
4447 }
4448 
4449 static void
4450 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4451 {
4452 	struct rdma_done_list *r = qp->rdlist;
4453 
4454 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4455 
4456 	while (r) {
4457 		if (r->xid == xid) {
4458 			cv_signal(&r->rdma_done_cv);
4459 			return;
4460 		} else {
4461 			r = r->next;
4462 		}
4463 	}
4464 	if (rib_debug > 1) {
4465 	    cmn_err(CE_WARN, "rdma_done_notify: "
4466 		"No matching xid for %u, qp %p\n", xid, (void *)qp);
4467 	}
4468 }
4469 
4470 rpcib_ats_t *
4471 get_ibd_entry(ib_gid_t *gid, ib_pkey_t pkey, rpcib_ibd_insts_t *ibds)
4472 {
4473 	rpcib_ats_t		*atsp;
4474 	int			i;
4475 
4476 	for (i = 0, atsp = ibds->rib_ats; i < ibds->rib_ibd_cnt; i++, atsp++) {
4477 		if (atsp->ras_port_gid.gid_prefix == gid->gid_prefix &&
4478 		    atsp->ras_port_gid.gid_guid == gid->gid_guid &&
4479 		    atsp->ras_pkey == pkey) {
4480 			return (atsp);
4481 		}
4482 	}
4483 	return (NULL);
4484 }
4485 
4486 int
4487 rib_get_ibd_insts_cb(dev_info_t *dip, void *arg)
4488 {
4489 	rpcib_ibd_insts_t *ibds = (rpcib_ibd_insts_t *)arg;
4490 	rpcib_ats_t	*atsp;
4491 	ib_pkey_t	pkey;
4492 	uint8_t		port;
4493 	ib_guid_t	hca_guid;
4494 	ib_gid_t	port_gid;
4495 
4496 	if ((i_ddi_node_state(dip) >= DS_ATTACHED) &&
4497 	    (strcmp(ddi_node_name(dip), "ibport") == 0) &&
4498 	    (strstr(ddi_get_name_addr(dip), "ipib") != NULL)) {
4499 
4500 		if (ibds->rib_ibd_cnt >= ibds->rib_ibd_alloc) {
4501 		    rpcib_ats_t	*tmp;
4502 
4503 		    tmp = (rpcib_ats_t *)kmem_zalloc((ibds->rib_ibd_alloc +
4504 			N_IBD_INSTANCES) * sizeof (rpcib_ats_t), KM_SLEEP);
4505 		    bcopy(ibds->rib_ats, tmp,
4506 			ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
4507 		    kmem_free(ibds->rib_ats,
4508 			ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
4509 		    ibds->rib_ats = tmp;
4510 		    ibds->rib_ibd_alloc += N_IBD_INSTANCES;
4511 		}
4512 		if (((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY,
4513 			dip, 0, "hca-guid", 0)) == 0) ||
4514 		    ((port = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4515 			0, "port-number", 0)) == 0) ||
4516 		    (ibt_get_port_state_byguid(hca_guid, port,
4517 			&port_gid, NULL) != IBT_SUCCESS) ||
4518 		    ((pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4519 			"port-pkey", IB_PKEY_INVALID_LIMITED)) <=
4520 			IB_PKEY_INVALID_FULL)) {
4521 		    return (DDI_WALK_CONTINUE);
4522 		}
4523 		atsp = &ibds->rib_ats[ibds->rib_ibd_cnt];
4524 		atsp->ras_inst = ddi_get_instance(dip);
4525 		atsp->ras_pkey = pkey;
4526 		atsp->ras_port_gid = port_gid;
4527 		ibds->rib_ibd_cnt++;
4528 	}
4529 	return (DDI_WALK_CONTINUE);
4530 }
4531 
4532 void
4533 rib_get_ibd_insts(rpcib_ibd_insts_t *ibds)
4534 {
4535 	ddi_walk_devs(ddi_root_node(), rib_get_ibd_insts_cb, ibds);
4536 }
4537 
4538 /*
4539  * Return ibd interfaces and ibd instances.
4540  */
4541 int
4542 get_ibd_ipaddr(rpcib_ibd_insts_t *ibds)
4543 {
4544 	TIUSER			*tiptr, *tiptr6;
4545 	vnode_t			*kvp, *kvp6;
4546 	vnode_t			*vp = NULL, *vp6 = NULL;
4547 	struct strioctl		iocb;
4548 	struct lifreq		lif_req;
4549 	int			k, ip_cnt;
4550 	rpcib_ats_t		*atsp;
4551 
4552 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
4553 		&kvp) == 0) {
4554 	    if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
4555 		&tiptr, CRED()) == 0) {
4556 		vp = tiptr->fp->f_vnode;
4557 	    } else {
4558 		VN_RELE(kvp);
4559 	    }
4560 	}
4561 
4562 	if (lookupname("/dev/udp6", UIO_SYSSPACE, FOLLOW, NULLVPP,
4563 		&kvp6) == 0) {
4564 	    if (t_kopen((file_t *)NULL, kvp6->v_rdev, FREAD|FWRITE,
4565 		&tiptr6, CRED()) == 0) {
4566 		vp6 = tiptr6->fp->f_vnode;
4567 	    } else {
4568 		VN_RELE(kvp6);
4569 	    }
4570 	}
4571 
4572 	if (vp == NULL && vp6 == NULL)
4573 		return (-1);
4574 
4575 	/* Get ibd ip's */
4576 	ip_cnt = 0;
4577 	for (k = 0, atsp = ibds->rib_ats; k < ibds->rib_ibd_cnt; k++, atsp++) {
4578 		/* IPv4 */
4579 	    if (vp != NULL) {
4580 		(void) bzero((void *)&lif_req, sizeof (struct lifreq));
4581 		(void) snprintf(lif_req.lifr_name,
4582 			sizeof (lif_req.lifr_name), "%s%d",
4583 			IBD_NAME, atsp->ras_inst);
4584 
4585 		(void) bzero((void *)&iocb, sizeof (struct strioctl));
4586 		iocb.ic_cmd = SIOCGLIFADDR;
4587 		iocb.ic_timout = 0;
4588 		iocb.ic_len = sizeof (struct lifreq);
4589 		iocb.ic_dp = (caddr_t)&lif_req;
4590 		if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) {
4591 		    atsp->ras_inet_type = AF_INET;
4592 		    bcopy(&lif_req.lifr_addr, &atsp->ras_sin,
4593 			sizeof (struct sockaddr_in));
4594 		    ip_cnt++;
4595 		    continue;
4596 		}
4597 	    }
4598 		/* Try IPv6 */
4599 	    if (vp6 != NULL) {
4600 		(void) bzero((void *)&lif_req, sizeof (struct lifreq));
4601 		(void) snprintf(lif_req.lifr_name,
4602 			sizeof (lif_req.lifr_name), "%s%d",
4603 			IBD_NAME, atsp->ras_inst);
4604 
4605 		(void) bzero((void *)&iocb, sizeof (struct strioctl));
4606 		iocb.ic_cmd = SIOCGLIFADDR;
4607 		iocb.ic_timout = 0;
4608 		iocb.ic_len = sizeof (struct lifreq);
4609 		iocb.ic_dp = (caddr_t)&lif_req;
4610 		if (kstr_ioctl(vp6, I_STR, (intptr_t)&iocb) == 0) {
4611 
4612 		    atsp->ras_inet_type = AF_INET6;
4613 		    bcopy(&lif_req.lifr_addr, &atsp->ras_sin6,
4614 			    sizeof (struct sockaddr_in6));
4615 		    ip_cnt++;
4616 		}
4617 	    }
4618 	}
4619 
4620 	if (vp6 != NULL) {
4621 	    (void) t_kclose(tiptr6, 0);
4622 	    VN_RELE(kvp6);
4623 	}
4624 	if (vp != NULL) {
4625 	    (void) t_kclose(tiptr, 0);
4626 	    VN_RELE(kvp);
4627 	}
4628 
4629 	if (ip_cnt == 0)
4630 	    return (-1);
4631 	else
4632 	    return (0);
4633 }
4634 
4635 char **
4636 get_ip_addrs(int *count)
4637 {
4638 	TIUSER			*tiptr;
4639 	vnode_t			*kvp;
4640 	int			num_of_ifs;
4641 	char			**addresses;
4642 	int			return_code;
4643 
4644 	/*
4645 	 * Open a device for doing down stream kernel ioctls
4646 	 */
4647 	return_code = lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW,
4648 	    NULLVPP, &kvp);
4649 	if (return_code != 0) {
4650 		cmn_err(CE_NOTE, "get_Ip_addrs: lookupname failed\n");
4651 		*count = -1;
4652 		return (NULL);
4653 	}
4654 
4655 	return_code = t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
4656 	    &tiptr, CRED());
4657 	if (return_code != 0) {
4658 		cmn_err(CE_NOTE, "get_Ip_addrs: t_kopen failed\n");
4659 		VN_RELE(kvp);
4660 		*count = -1;
4661 		return (NULL);
4662 	}
4663 
4664 	/*
4665 	 * Perform the first ioctl to get the number of interfaces
4666 	 */
4667 	return_code = get_interfaces(tiptr, &num_of_ifs);
4668 	if (return_code != 0 || num_of_ifs == 0) {
4669 		cmn_err(CE_NOTE, "get_Ip_addrs: get_interfaces failed\n");
4670 		(void) t_kclose(tiptr, 0);
4671 		VN_RELE(kvp);
4672 		*count = -1;
4673 		return (NULL);
4674 	}
4675 
4676 	/*
4677 	 * Perform the second ioctl to get the address on each interface
4678 	 * found.
4679 	 */
4680 	addresses = kmem_zalloc(num_of_ifs * sizeof (char *), KM_SLEEP);
4681 	return_code = find_addrs(tiptr, addresses, num_of_ifs);
4682 	if (return_code <= 0) {
4683 		cmn_err(CE_NOTE, "get_Ip_addrs: find_addrs failed\n");
4684 		(void) t_kclose(tiptr, 0);
4685 		kmem_free(addresses, num_of_ifs * sizeof (char *));
4686 		VN_RELE(kvp);
4687 		*count = -1;
4688 		return (NULL);
4689 	}
4690 
4691 	*count = return_code;
4692 	VN_RELE(kvp);
4693 	(void) t_kclose(tiptr, 0);
4694 	return (addresses);
4695 }
4696 
4697 int
4698 get_interfaces(TIUSER *tiptr, int *num)
4699 {
4700 	struct lifnum		if_buf;
4701 	struct strioctl		iocb;
4702 	vnode_t			*vp;
4703 	int			return_code;
4704 
4705 	/*
4706 	 * Prep the number of interfaces request buffer for ioctl
4707 	 */
4708 	(void) bzero((void *)&if_buf, sizeof (struct lifnum));
4709 	if_buf.lifn_family = AF_UNSPEC;
4710 	if_buf.lifn_flags = 0;
4711 
4712 	/*
4713 	 * Prep the kernel ioctl buffer and send it down stream
4714 	 */
4715 	(void) bzero((void *)&iocb, sizeof (struct strioctl));
4716 	iocb.ic_cmd = SIOCGLIFNUM;
4717 	iocb.ic_timout = 0;
4718 	iocb.ic_len = sizeof (if_buf);
4719 	iocb.ic_dp = (caddr_t)&if_buf;
4720 
4721 	vp = tiptr->fp->f_vnode;
4722 	return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4723 	if (return_code != 0) {
4724 		cmn_err(CE_NOTE, "get_interfaces: kstr_ioctl failed\n");
4725 		*num = -1;
4726 		return (-1);
4727 	}
4728 
4729 	*num = if_buf.lifn_count;
4730 #ifdef	DEBUG
4731 	if (rib_debug > 1)
4732 		cmn_err(CE_NOTE, "Number of interfaces detected: %d\n",
4733 		    if_buf.lifn_count);
4734 #endif
4735 	return (0);
4736 }
4737 
4738 int
4739 find_addrs(TIUSER *tiptr, char **addrs, int num_ifs)
4740 {
4741 	struct lifconf		lifc;
4742 	struct lifreq		*if_data_buf;
4743 	struct strioctl		iocb;
4744 	caddr_t			request_buffer;
4745 	struct sockaddr_in	*sin4;
4746 	struct sockaddr_in6	*sin6;
4747 	vnode_t			*vp;
4748 	int			i, count, return_code;
4749 
4750 	/*
4751 	 * Prep the buffer for requesting all interface's info
4752 	 */
4753 	(void) bzero((void *)&lifc, sizeof (struct lifconf));
4754 	lifc.lifc_family = AF_UNSPEC;
4755 	lifc.lifc_flags = 0;
4756 	lifc.lifc_len = num_ifs * sizeof (struct lifreq);
4757 
4758 	request_buffer = kmem_zalloc(num_ifs * sizeof (struct lifreq),
4759 	    KM_SLEEP);
4760 
4761 	lifc.lifc_buf = request_buffer;
4762 
4763 	/*
4764 	 * Prep the kernel ioctl buffer and send it down stream
4765 	 */
4766 	(void) bzero((void *)&iocb, sizeof (struct strioctl));
4767 	iocb.ic_cmd = SIOCGLIFCONF;
4768 	iocb.ic_timout = 0;
4769 	iocb.ic_len = sizeof (struct lifconf);
4770 	iocb.ic_dp = (caddr_t)&lifc;
4771 
4772 	vp = tiptr->fp->f_vnode;
4773 	return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4774 	if (return_code != 0) {
4775 		cmn_err(CE_NOTE, "find_addrs: kstr_ioctl failed\n");
4776 		kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
4777 		return (-1);
4778 	}
4779 
4780 	/*
4781 	 * Extract addresses and fill them in the requested array
4782 	 * IB_SVC_NAME_LEN is defined to be 64 so it  covers both IPv4 &
4783 	 * IPv6. Here count is the number of IP addresses collected.
4784 	 */
4785 	if_data_buf = lifc.lifc_req;
4786 	count = 0;
4787 	for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--,
4788 	if_data_buf++) {
4789 		if (if_data_buf->lifr_addr.ss_family == AF_INET) {
4790 			sin4 = (struct sockaddr_in *)&if_data_buf->lifr_addr;
4791 			addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
4792 			(void) inet_ntop(AF_INET, &sin4->sin_addr,
4793 			    addrs[count], IB_SVC_NAME_LEN);
4794 			count ++;
4795 		}
4796 
4797 		if (if_data_buf->lifr_addr.ss_family == AF_INET6) {
4798 			sin6 = (struct sockaddr_in6 *)&if_data_buf->lifr_addr;
4799 			addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
4800 			(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
4801 			    addrs[count], IB_SVC_NAME_LEN);
4802 			count ++;
4803 		}
4804 	}
4805 
4806 	kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
4807 	return (count);
4808 }
4809 
4810 /*
4811  * Goes through all connections and closes the channel
4812  * This will cause all the WRs on those channels to be
4813  * flushed.
4814  */
4815 static void
4816 rib_close_channels(rib_conn_list_t *connlist)
4817 {
4818 	CONN 		*conn;
4819 	rib_qp_t	*qp;
4820 
4821 	rw_enter(&connlist->conn_lock, RW_READER);
4822 	conn = connlist->conn_hd;
4823 	while (conn != NULL) {
4824 		mutex_enter(&conn->c_lock);
4825 		qp = ctoqp(conn);
4826 		if (conn->c_state & C_CONNECTED) {
4827 			/*
4828 			 * Live connection in CONNECTED state.
4829 			 * Call ibt_close_rc_channel in nonblocking mode
4830 			 * with no callbacks.
4831 			 */
4832 			conn->c_state = C_ERROR;
4833 			(void) ibt_close_rc_channel(qp->qp_hdl,
4834 				IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
4835 			(void) ibt_free_channel(qp->qp_hdl);
4836 			qp->qp_hdl = NULL;
4837 		} else {
4838 			if (conn->c_state == C_ERROR &&
4839 				qp->qp_hdl != NULL) {
4840 				/*
4841 				 * Connection in ERROR state but
4842 				 * channel is not yet freed.
4843 				 */
4844 				(void) ibt_close_rc_channel(qp->qp_hdl,
4845 					IBT_NOCALLBACKS, NULL, 0, NULL,
4846 					NULL, 0);
4847 				(void) ibt_free_channel(qp->qp_hdl);
4848 				qp->qp_hdl = NULL;
4849 			}
4850 		}
4851 		mutex_exit(&conn->c_lock);
4852 		conn = conn->c_next;
4853 	}
4854 	rw_exit(&connlist->conn_lock);
4855 }
4856 
4857 /*
4858  * Frees up all connections that are no longer being referenced
4859  */
4860 static void
4861 rib_purge_connlist(rib_conn_list_t *connlist)
4862 {
4863 	CONN 		*conn;
4864 
4865 top:
4866 	rw_enter(&connlist->conn_lock, RW_READER);
4867 	conn = connlist->conn_hd;
4868 	while (conn != NULL) {
4869 		mutex_enter(&conn->c_lock);
4870 
4871 		/*
4872 		 * At this point connection is either in ERROR
4873 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4874 		 * then some other thread is culling that connection.
4875 		 * If not and if c_ref is 0, then destroy the connection.
4876 		 */
4877 		if (conn->c_ref == 0 &&
4878 			conn->c_state != C_DISCONN_PEND) {
4879 			/*
4880 			 * Cull the connection
4881 			 */
4882 			conn->c_state = C_DISCONN_PEND;
4883 			mutex_exit(&conn->c_lock);
4884 			rw_exit(&connlist->conn_lock);
4885 			(void) rib_disconnect_channel(conn, connlist);
4886 			goto top;
4887 		} else {
4888 			/*
4889 			 * conn disconnect already scheduled or will
4890 			 * happen from conn_release when c_ref drops to 0.
4891 			 */
4892 			mutex_exit(&conn->c_lock);
4893 		}
4894 		conn = conn->c_next;
4895 	}
4896 	rw_exit(&connlist->conn_lock);
4897 
4898 	/*
4899 	 * At this point, only connections with c_ref != 0 are on the list
4900 	 */
4901 }
4902 
4903 /*
4904  * Cleans and closes up all uses of the HCA
4905  */
4906 static void
4907 rib_detach_hca(rib_hca_t *hca)
4908 {
4909 
4910 	/*
4911 	 * Stop all services on the HCA
4912 	 * Go through cl_conn_list and close all rc_channels
4913 	 * Go through svr_conn_list and close all rc_channels
4914 	 * Free connections whose c_ref has dropped to 0
4915 	 * Destroy all CQs
4916 	 * Deregister and released all buffer pool memory after all
4917 	 * connections are destroyed
4918 	 * Free the protection domain
4919 	 * ibt_close_hca()
4920 	 */
4921 	rw_enter(&hca->state_lock, RW_WRITER);
4922 	if (hca->state == HCA_DETACHED) {
4923 		rw_exit(&hca->state_lock);
4924 		return;
4925 	}
4926 
4927 	hca->state = HCA_DETACHED;
4928 	rib_stat->nhca_inited--;
4929 
4930 	rib_stop_services(hca);
4931 	rib_deregister_ats();
4932 	rib_close_channels(&hca->cl_conn_list);
4933 	rib_close_channels(&hca->srv_conn_list);
4934 	rw_exit(&hca->state_lock);
4935 
4936 	rib_purge_connlist(&hca->cl_conn_list);
4937 	rib_purge_connlist(&hca->srv_conn_list);
4938 
4939 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4940 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4941 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4942 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4943 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4944 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4945 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4946 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4947 
4948 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4949 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4950 	if (hca->srv_conn_list.conn_hd == NULL &&
4951 		hca->cl_conn_list.conn_hd == NULL) {
4952 		/*
4953 		 * conn_lists are NULL, so destroy
4954 		 * buffers, close hca and be done.
4955 		 */
4956 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4957 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4958 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4959 		(void) ibt_close_hca(hca->hca_hdl);
4960 		hca->hca_hdl = NULL;
4961 	}
4962 	rw_exit(&hca->cl_conn_list.conn_lock);
4963 	rw_exit(&hca->srv_conn_list.conn_lock);
4964 
4965 	if (hca->hca_hdl != NULL) {
4966 		mutex_enter(&hca->inuse_lock);
4967 		while (hca->inuse)
4968 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4969 		mutex_exit(&hca->inuse_lock);
4970 		/*
4971 		 * conn_lists are now NULL, so destroy
4972 		 * buffers, close hca and be done.
4973 		 */
4974 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4975 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4976 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4977 		(void) ibt_close_hca(hca->hca_hdl);
4978 		hca->hca_hdl = NULL;
4979 	}
4980 }
4981