xref: /illumos-gate/usr/src/uts/common/rpc/rpcib.c (revision d6bb6a8465e557cb946ef49d56ed3202f6218652)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * The rpcib plugin. Implements the interface for RDMATF's
31  * interaction with IBTF.
32  */
33 
34 #include <sys/param.h>
35 #include <sys/types.h>
36 #include <sys/user.h>
37 #include <sys/systm.h>
38 #include <sys/sysmacros.h>
39 #include <sys/proc.h>
40 #include <sys/socket.h>
41 #include <sys/file.h>
42 #include <sys/stream.h>
43 #include <sys/strsubr.h>
44 #include <sys/stropts.h>
45 #include <sys/errno.h>
46 #include <sys/kmem.h>
47 #include <sys/debug.h>
48 #include <sys/systm.h>
49 #include <sys/pathname.h>
50 #include <sys/kstat.h>
51 #include <sys/t_lock.h>
52 #include <sys/ddi.h>
53 #include <sys/cmn_err.h>
54 #include <sys/time.h>
55 #include <sys/isa_defs.h>
56 #include <sys/callb.h>
57 #include <sys/sunddi.h>
58 #include <sys/sunndi.h>
59 
60 #include <sys/ib/ibtl/ibti.h>
61 #include <rpc/rpc.h>
62 #include <rpc/ib.h>
63 
64 #include <sys/modctl.h>
65 
66 #include <sys/pathname.h>
67 #include <sys/kstr.h>
68 #include <sys/sockio.h>
69 #include <sys/vnode.h>
70 #include <sys/tiuser.h>
71 #include <net/if.h>
72 #include <sys/cred.h>
73 
74 
75 extern char *inet_ntop(int, const void *, char *, int);
76 
77 
78 /*
79  * Prototype declarations for driver ops
80  */
81 
82 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
83 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
84 			    void *, void **);
85 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
86 
87 
88 /* rpcib cb_ops */
89 static struct cb_ops rpcib_cbops = {
90 	nulldev,		/* open */
91 	nulldev,		/* close */
92 	nodev,			/* strategy */
93 	nodev,			/* print */
94 	nodev,			/* dump */
95 	nodev,			/* read */
96 	nodev,			/* write */
97 	nodev,			/* ioctl */
98 	nodev,			/* devmap */
99 	nodev,			/* mmap */
100 	nodev,			/* segmap */
101 	nochpoll,		/* poll */
102 	ddi_prop_op,		/* prop_op */
103 	NULL,			/* stream */
104 	D_MP,			/* cb_flag */
105 	CB_REV,			/* rev */
106 	nodev,			/* int (*cb_aread)() */
107 	nodev			/* int (*cb_awrite)() */
108 };
109 
110 /*
111  * Device options
112  */
113 static struct dev_ops rpcib_ops = {
114 	DEVO_REV,		/* devo_rev, */
115 	0,			/* refcnt  */
116 	rpcib_getinfo,		/* info */
117 	nulldev,		/* identify */
118 	nulldev,		/* probe */
119 	rpcib_attach,		/* attach */
120 	rpcib_detach,		/* detach */
121 	nodev,			/* reset */
122 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
123 	NULL,			/* bus operations */
124 	NULL			/* power */
125 };
126 
127 /*
128  * Module linkage information.
129  */
130 
131 static struct modldrv rib_modldrv = {
132 	&mod_driverops,			    /* Driver module */
133 	"RPCIB plugin driver, ver %I%", /* Driver name and version */
134 	&rpcib_ops,		    /* Driver ops */
135 };
136 
137 static struct modlinkage rib_modlinkage = {
138 	MODREV_1,
139 	(void *)&rib_modldrv,
140 	NULL
141 };
142 
143 /*
144  * rib_stat: private data pointer used when registering
145  *	with the IBTF.  It is returned to the consumer
146  *	in all callbacks.
147  */
148 static rpcib_state_t *rib_stat = NULL;
149 
150 #define	RNR_RETRIES	2
151 #define	MAX_PORTS	2
152 
153 int preposted_rbufs = 16;
154 int send_threshold = 1;
155 
/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move to the rpcib_state_t structure, since it
 * tells which state the plugin is in for a particular type of service
 * like NFS, NLM or the v4 callback daemon. The plugin might be in the
 * accept state for one and in the no_accept state for the other.
 */
165 int		plugin_state;
166 kmutex_t	plugin_state_lock;
167 
168 
169 /*
170  * RPCIB RDMATF operations
171  */
172 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
173 static rdma_stat rib_disconnect(CONN *conn);
174 static void rib_listen(struct rdma_svc_data *rd);
175 static void rib_listen_stop(struct rdma_svc_data *rd);
176 static rdma_stat rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
177 	struct mrc *buf_handle);
178 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
179 	struct mrc buf_handle);
180 static rdma_stat rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
181 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle);
182 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
183 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle);
184 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
185 	caddr_t buf, int len, int cpu);
186 
187 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
188 
189 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
190 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
191 
192 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
193 
194 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
195 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
196 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
197 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
198 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
199 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
200 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
201 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
202 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
203 static rdma_stat rib_conn_release(CONN *conn);
204 static rdma_stat rib_getinfo(rdma_info_t *info);
205 static rdma_stat rib_register_ats(rib_hca_t *);
/* Takes no arguments; "(void)" makes this a real prototype. */
static void rib_deregister_ats(void);
207 static void rib_stop_services(rib_hca_t *);
208 
209 /*
210  * RPCIB addressing operations
211  */
212 char ** get_ip_addrs(int *count);
213 int get_interfaces(TIUSER *tiptr, int *num);
214 int find_addrs(TIUSER *tiptr, char **addrs, int num_ifs);
215 int get_ibd_ipaddr(rpcib_ibd_insts_t *);
216 rpcib_ats_t *get_ibd_entry(ib_gid_t *, ib_pkey_t, rpcib_ibd_insts_t *);
217 void rib_get_ibd_insts(rpcib_ibd_insts_t *);
218 
219 
220 /*
221  * RDMA operations the RPCIB module exports
222  */
223 static rdmaops_t rib_ops = {
224 	rib_reachable,
225 	rib_conn_get,
226 	rib_conn_release,
227 	rib_listen,
228 	rib_listen_stop,
229 	rib_registermem,
230 	rib_deregistermem,
231 	rib_registermemsync,
232 	rib_deregistermemsync,
233 	rib_syncmem,
234 	rib_reg_buf_alloc,
235 	rib_reg_buf_free,
236 	rib_send,
237 	rib_send_resp,
238 	rib_post_resp,
239 	rib_post_recv,
240 	rib_recv,
241 	rib_read,
242 	rib_write,
243 	rib_getinfo
244 };
245 
246 /*
247  * RDMATF RPCIB plugin details
248  */
249 static rdma_mod_t rib_mod = {
250 	"ibtf",		/* api name */
251 	RDMATF_VERS_1,
252 	0,
253 	&rib_ops,	/* rdma op vector for ibtf */
254 };
255 
256 static rdma_stat open_hcas(rpcib_state_t *);
257 static rdma_stat rib_qp_init(rib_qp_t *, int);
258 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
259 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
260 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
261 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
262 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
263 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
264 	ibt_mr_hdl_t *, ibt_mr_desc_t *);
265 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *);
266 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
267 	rib_qp_t **);
268 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
269 	rib_qp_t **);
270 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
271 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
272 static int rib_free_sendwait(struct send_wid *);
273 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
274 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
275 static void rdma_done_rem_list(rib_qp_t *);
276 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
277 
278 static void rib_async_handler(void *,
279 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
280 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
281 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
282 static int rib_free_svc_recv(struct svc_recv *);
283 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
284 static void rib_free_wid(struct recv_wid *);
285 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
286 static void rib_detach_hca(rib_hca_t *);
287 static rdma_stat rib_chk_srv_ats(rib_hca_t *, struct netbuf *, int,
288 	ibt_path_info_t *);
289 
290 /*
291  * Registration with IBTF as a consumer
292  */
293 static struct ibt_clnt_modinfo_s rib_modinfo = {
294 	IBTI_V2,
295 	IBT_GENERIC,
296 	rib_async_handler,	/* async event handler */
297 	NULL,			/* Memory Region Handler */
298 	"nfs/ib"
299 };
300 
/*
 * Global structure
 */
304 
305 typedef struct rpcib_s {
306 	dev_info_t	*rpcib_dip;
307 	kmutex_t	rpcib_mutex;
308 } rpcib_t;
309 
310 rpcib_t rpcib;
311 
/*
 * /etc/system controlled variable to control
 * debugging in the rpcib kernel module.
 * Set it to a non-zero value to enable debugging messages;
 * values greater than 1 produce additional messages.
 */
318 int rib_debug = 0;
319 
320 static int ats_running = 0;
321 int
322 _init(void)
323 {
324 	int		error;
325 
326 	error = mod_install((struct modlinkage *)&rib_modlinkage);
327 	if (error != 0) {
328 		/*
329 		 * Could not load module
330 		 */
331 		return (error);
332 	}
333 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
334 
335 	return (0);
336 }
337 
338 int
339 _fini()
340 {
341 	int status;
342 
343 	if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) {
344 		return (EBUSY);
345 	}
346 
347 	rib_deregister_ats();
348 
349 	/*
350 	 * Remove module
351 	 */
352 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
353 		(void) rdma_register_mod(&rib_mod);
354 		return (status);
355 	}
356 	mutex_destroy(&plugin_state_lock);
357 	return (0);
358 }
359 
360 int
361 _info(struct modinfo *modinfop)
362 {
363 	return (mod_info(&rib_modlinkage, modinfop));
364 }
365 
366 
367 /*
368  * rpcib_getinfo()
369  * Given the device number, return the devinfo pointer or the
370  * instance number.
371  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
372  */
373 
374 /*ARGSUSED*/
375 static int
376 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
377 {
378 	int ret = DDI_SUCCESS;
379 
380 	switch (cmd) {
381 	case DDI_INFO_DEVT2DEVINFO:
382 		if (rpcib.rpcib_dip != NULL)
383 			*result = rpcib.rpcib_dip;
384 		else {
385 			*result = NULL;
386 			ret = DDI_FAILURE;
387 		}
388 		break;
389 
390 	case DDI_INFO_DEVT2INSTANCE:
391 		*result = NULL;
392 		break;
393 
394 	default:
395 		ret = DDI_FAILURE;
396 	}
397 	return (ret);
398 }
399 
400 static int
401 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
402 {
403 	ibt_status_t	ibt_status;
404 	rdma_stat	r_status;
405 
406 	switch (cmd) {
407 	case DDI_ATTACH:
408 		break;
409 	case DDI_RESUME:
410 		return (DDI_SUCCESS);
411 	default:
412 		return (DDI_FAILURE);
413 	}
414 
415 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
416 
417 	mutex_enter(&rpcib.rpcib_mutex);
418 	if (rpcib.rpcib_dip != NULL) {
419 		mutex_exit(&rpcib.rpcib_mutex);
420 		return (DDI_FAILURE);
421 	}
422 	rpcib.rpcib_dip = dip;
423 	mutex_exit(&rpcib.rpcib_mutex);
424 	/*
425 	 * Create the "rpcib" minor-node.
426 	 */
427 	if (ddi_create_minor_node(dip,
428 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
429 		/* Error message, no cmn_err as they print on console */
430 		return (DDI_FAILURE);
431 	}
432 
433 	if (rib_stat == NULL) {
434 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
435 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
436 	}
437 
438 	rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
439 	if (rib_stat->hca_count < 1) {
440 		mutex_destroy(&rib_stat->open_hca_lock);
441 		kmem_free(rib_stat, sizeof (*rib_stat));
442 		rib_stat = NULL;
443 		return (DDI_FAILURE);
444 	}
445 
446 	ibt_status = ibt_attach(&rib_modinfo, dip,
447 			(void *)rib_stat, &rib_stat->ibt_clnt_hdl);
448 	if (ibt_status != IBT_SUCCESS) {
449 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
450 		mutex_destroy(&rib_stat->open_hca_lock);
451 		kmem_free(rib_stat, sizeof (*rib_stat));
452 		rib_stat = NULL;
453 		return (DDI_FAILURE);
454 	}
455 
456 	mutex_enter(&rib_stat->open_hca_lock);
457 	if (open_hcas(rib_stat) != RDMA_SUCCESS) {
458 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
459 		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
460 		mutex_exit(&rib_stat->open_hca_lock);
461 		mutex_destroy(&rib_stat->open_hca_lock);
462 		kmem_free(rib_stat, sizeof (*rib_stat));
463 		rib_stat = NULL;
464 		return (DDI_FAILURE);
465 	}
466 	mutex_exit(&rib_stat->open_hca_lock);
467 
468 	/*
469 	 * Register with rdmatf
470 	 */
471 	rib_mod.rdma_count = rib_stat->hca_count;
472 	r_status = rdma_register_mod(&rib_mod);
473 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
474 		rib_detach_hca(rib_stat->hca);
475 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
476 		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
477 		mutex_destroy(&rib_stat->open_hca_lock);
478 		kmem_free(rib_stat, sizeof (*rib_stat));
479 		rib_stat = NULL;
480 		return (DDI_FAILURE);
481 	}
482 
483 
484 	return (DDI_SUCCESS);
485 }
486 
487 /*ARGSUSED*/
488 static int
489 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
490 {
491 	switch (cmd) {
492 
493 	case DDI_DETACH:
494 		break;
495 
496 	case DDI_SUSPEND:
497 	default:
498 		return (DDI_FAILURE);
499 	}
500 
501 	/*
502 	 * Detach the hca and free resources
503 	 */
504 	mutex_enter(&plugin_state_lock);
505 	plugin_state = NO_ACCEPT;
506 	mutex_exit(&plugin_state_lock);
507 	rib_detach_hca(rib_stat->hca);
508 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
509 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
510 
511 	mutex_enter(&rpcib.rpcib_mutex);
512 	rpcib.rpcib_dip = NULL;
513 	mutex_exit(&rpcib.rpcib_mutex);
514 
515 	mutex_destroy(&rpcib.rpcib_mutex);
516 	return (DDI_SUCCESS);
517 }
518 
519 
520 static void
521 rib_deregister_ats()
522 {
523 	rib_hca_t		*hca;
524 	rib_service_t		*srv_list, *to_remove;
525 	ibt_status_t   		ibt_status;
526 
527 	/*
528 	 * deregister the Address Translation Service.
529 	 */
530 	hca = rib_stat->hca;
531 	rw_enter(&hca->service_list_lock, RW_WRITER);
532 	srv_list = hca->ats_list;
533 	while (srv_list != NULL) {
534 		to_remove = srv_list;
535 		srv_list = to_remove->srv_next;
536 
537 		ibt_status = ibt_deregister_ar(hca->ibt_clnt_hdl,
538 				&to_remove->srv_ar);
539 		if (ibt_status != IBT_SUCCESS) {
540 #ifdef DEBUG
541 		    if (rib_debug) {
542 			cmn_err(CE_WARN, "_fini: "
543 			    "ibt_deregister_ar FAILED"
544 				" status: %d", ibt_status);
545 		    }
546 #endif
547 		} else {
548 		    mutex_enter(&rib_stat->open_hca_lock);
549 		    ats_running = 0;
550 		    mutex_exit(&rib_stat->open_hca_lock);
551 #ifdef DEBUG
552 		    if (rib_debug) {
553 
554 			cmn_err(CE_NOTE, "_fini: "
555 			    "Successfully unregistered"
556 			    " ATS service: %s",
557 			    to_remove->srv_name);
558 		    }
559 #endif
560 		}
561 		kmem_free(to_remove, sizeof (rib_service_t));
562 	}
563 	hca->ats_list = NULL;
564 	rw_exit(&hca->service_list_lock);
565 }
566 
567 static void rib_rbufpool_free(rib_hca_t *, int);
568 static void rib_rbufpool_deregister(rib_hca_t *, int);
569 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
570 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
571 static rdma_stat rib_rem_replylist(rib_qp_t *);
572 static int rib_remreply(rib_qp_t *, struct reply *);
573 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
574 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
575 
576 /*
577  * One CQ pair per HCA
578  */
579 static rdma_stat
580 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
581 	rib_cq_t **cqp, rpcib_state_t *ribstat)
582 {
583 	rib_cq_t	*cq;
584 	ibt_cq_attr_t	cq_attr;
585 	uint32_t	real_size;
586 	ibt_status_t	status;
587 	rdma_stat	error = RDMA_SUCCESS;
588 
589 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
590 	cq->rib_hca = hca;
591 	cq_attr.cq_size = cq_size;
592 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
593 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
594 	    &real_size);
595 	if (status != IBT_SUCCESS) {
596 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
597 				" status=%d", status);
598 		error = RDMA_FAILED;
599 		goto fail;
600 	}
601 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
602 
603 	/*
604 	 * Enable CQ callbacks. CQ Callbacks are single shot
605 	 * (e.g. you have to call ibt_enable_cq_notify()
606 	 * after each callback to get another one).
607 	 */
608 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
609 	if (status != IBT_SUCCESS) {
610 		cmn_err(CE_WARN, "rib_create_cq: "
611 			"enable_cq_notify failed, status %d", status);
612 		error = RDMA_FAILED;
613 		goto fail;
614 	}
615 	*cqp = cq;
616 
617 	return (error);
618 fail:
619 	if (cq->rib_cq_hdl)
620 		(void) ibt_free_cq(cq->rib_cq_hdl);
621 	if (cq)
622 		kmem_free(cq, sizeof (rib_cq_t));
623 	return (error);
624 }
625 
626 static rdma_stat
627 open_hcas(rpcib_state_t *ribstat)
628 {
629 	rib_hca_t		*hca;
630 	ibt_status_t		ibt_status;
631 	rdma_stat		status;
632 	ibt_hca_portinfo_t	*pinfop;
633 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
634 	uint_t			size, cq_size;
635 	int			i;
636 
637 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
638 	if (ribstat->hcas == NULL)
639 		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
640 				    sizeof (rib_hca_t), KM_SLEEP);
641 
642 	/*
643 	 * Open a hca and setup for RDMA
644 	 */
645 	for (i = 0; i < ribstat->hca_count; i++) {
646 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
647 				ribstat->hca_guids[i],
648 				&ribstat->hcas[i].hca_hdl);
649 		if (ibt_status != IBT_SUCCESS) {
650 			cmn_err(CE_WARN, "open_hcas: ibt_open_hca (%d) "
651 				"returned %d", i, ibt_status);
652 			continue;
653 		}
654 		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
655 		hca = &(ribstat->hcas[i]);
656 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
657 		hca->state = HCA_INITED;
658 
659 		/*
660 		 * query HCA info
661 		 */
662 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
663 		if (ibt_status != IBT_SUCCESS) {
664 			cmn_err(CE_WARN, "open_hcas: ibt_query_hca "
665 			    "returned %d (hca_guid 0x%llx)",
666 			    ibt_status, (longlong_t)ribstat->hca_guids[i]);
667 			goto fail1;
668 		}
669 
670 		/*
671 		 * One PD (Protection Domain) per HCA.
672 		 * A qp is allowed to access a memory region
673 		 * only when it's in the same PD as that of
674 		 * the memory region.
675 		 */
676 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
677 		if (ibt_status != IBT_SUCCESS) {
678 			cmn_err(CE_WARN, "open_hcas: ibt_alloc_pd "
679 				"returned %d (hca_guid 0x%llx)",
680 				ibt_status, (longlong_t)ribstat->hca_guids[i]);
681 			goto fail1;
682 		}
683 
684 		/*
685 		 * query HCA ports
686 		 */
687 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
688 				0, &pinfop, &hca->hca_nports, &size);
689 		if (ibt_status != IBT_SUCCESS) {
690 			cmn_err(CE_WARN, "open_hcas: "
691 				"ibt_query_hca_ports returned %d "
692 				"(hca_guid 0x%llx)",
693 				ibt_status, (longlong_t)hca->hca_guid);
694 			goto fail2;
695 		}
696 		hca->hca_ports = pinfop;
697 		hca->hca_pinfosz = size;
698 		pinfop = NULL;
699 
700 		cq_size = DEF_CQ_SIZE; /* default cq size */
701 		/*
702 		 * Create 2 pairs of cq's (1 pair for client
703 		 * and the other pair for server) on this hca.
704 		 * If number of qp's gets too large, then several
705 		 * cq's will be needed.
706 		 */
707 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
708 				&hca->svc_rcq, ribstat);
709 		if (status != RDMA_SUCCESS) {
710 			goto fail3;
711 		}
712 
713 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
714 				&hca->svc_scq, ribstat);
715 		if (status != RDMA_SUCCESS) {
716 			goto fail3;
717 		}
718 
719 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
720 				&hca->clnt_rcq, ribstat);
721 		if (status != RDMA_SUCCESS) {
722 			goto fail3;
723 		}
724 
725 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
726 				&hca->clnt_scq, ribstat);
727 		if (status != RDMA_SUCCESS) {
728 			goto fail3;
729 		}
730 
731 		/*
732 		 * Create buffer pools.
733 		 * Note rib_rbuf_create also allocates memory windows.
734 		 */
735 		hca->recv_pool = rib_rbufpool_create(hca,
736 					RECV_BUFFER, MAX_BUFS);
737 		if (hca->recv_pool == NULL) {
738 			cmn_err(CE_WARN, "open_hcas: recv buf pool failed\n");
739 			goto fail3;
740 		}
741 
742 		hca->send_pool = rib_rbufpool_create(hca,
743 					SEND_BUFFER, MAX_BUFS);
744 		if (hca->send_pool == NULL) {
745 			cmn_err(CE_WARN, "open_hcas: send buf pool failed\n");
746 			rib_rbufpool_destroy(hca, RECV_BUFFER);
747 			goto fail3;
748 		}
749 
750 		/*
751 		 * Initialize the registered service list and
752 		 * the lock
753 		 */
754 		hca->service_list = NULL;
755 		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
756 
757 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
758 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
759 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
760 			hca->iblock);
761 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
762 			hca->iblock);
763 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
764 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
765 		hca->inuse = TRUE;
766 		/*
767 		 * XXX One hca only. Add multi-hca functionality if needed
768 		 * later.
769 		 */
770 		ribstat->hca = hca;
771 		ribstat->nhca_inited++;
772 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
773 		break;
774 
775 fail3:
776 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
777 fail2:
778 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
779 fail1:
780 		(void) ibt_close_hca(hca->hca_hdl);
781 
782 	}
783 	if (ribstat->hca != NULL)
784 		return (RDMA_SUCCESS);
785 	else
786 		return (RDMA_FAILED);
787 }
788 
789 /*
790  * Callback routines
791  */
792 
793 /*
794  * SCQ handlers
795  */
796 /* ARGSUSED */
797 static void
798 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
799 {
800 	ibt_status_t	ibt_status;
801 	ibt_wc_t	wc;
802 	int		i;
803 
804 	/*
805 	 * Re-enable cq notify here to avoid missing any
806 	 * completion queue notification.
807 	 */
808 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
809 
810 	ibt_status = IBT_SUCCESS;
811 	while (ibt_status != IBT_CQ_EMPTY) {
812 	    bzero(&wc, sizeof (wc));
813 	    ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
814 	    if (ibt_status != IBT_SUCCESS)
815 		return;
816 
817 	/*
818 	 * Got a send completion
819 	 */
820 	    if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
821 		struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
822 		CONN	*conn = qptoc(wd->qp);
823 
824 		mutex_enter(&wd->sendwait_lock);
825 		switch (wc.wc_status) {
826 		case IBT_WC_SUCCESS:
827 			wd->status = RDMA_SUCCESS;
828 			break;
829 		case IBT_WC_WR_FLUSHED_ERR:
830 			wd->status = RDMA_FAILED;
831 			break;
832 		default:
833 /*
834  *    RC Send Q Error Code		Local state     Remote State
835  *    ==================== 		===========     ============
836  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
837  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
838  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
839  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
840  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
841  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
842  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
843  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
844  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
845  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
846  *    IBT_WC_WR_FLUSHED_ERR               None            None
847  */
848 #ifdef DEBUG
849 	if (rib_debug > 1) {
850 	    if (wc.wc_status != IBT_WC_SUCCESS) {
851 		    cmn_err(CE_NOTE, "rib_clnt_scq_handler: "
852 			"WR completed in error, wc.wc_status:%d, "
853 			"wc_id:%llx\n", wc.wc_status, (longlong_t)wc.wc_id);
854 	    }
855 	}
856 #endif
857 			/*
858 			 * Channel in error state. Set connection to
859 			 * ERROR and cleanup will happen either from
860 			 * conn_release  or from rib_conn_get
861 			 */
862 			wd->status = RDMA_FAILED;
863 			mutex_enter(&conn->c_lock);
864 			if (conn->c_state != C_DISCONN_PEND)
865 				conn->c_state = C_ERROR;
866 			mutex_exit(&conn->c_lock);
867 			break;
868 		}
869 		if (wd->cv_sig == 1) {
870 			/*
871 			 * Notify poster
872 			 */
873 			cv_signal(&wd->wait_cv);
874 			mutex_exit(&wd->sendwait_lock);
875 		} else {
876 			/*
877 			 * Poster not waiting for notification.
878 			 * Free the send buffers and send_wid
879 			 */
880 			for (i = 0; i < wd->nsbufs; i++) {
881 				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
882 					(void *)(uintptr_t)wd->sbufaddr[i]);
883 			}
884 			mutex_exit(&wd->sendwait_lock);
885 			(void) rib_free_sendwait(wd);
886 		}
887 	    }
888 	}
889 }
890 
891 /* ARGSUSED */
892 static void
893 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
894 {
895 	ibt_status_t	ibt_status;
896 	ibt_wc_t	wc;
897 	int		i;
898 
899 	/*
900 	 * Re-enable cq notify here to avoid missing any
901 	 * completion queue notification.
902 	 */
903 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
904 
905 	ibt_status = IBT_SUCCESS;
906 	while (ibt_status != IBT_CQ_EMPTY) {
907 	    bzero(&wc, sizeof (wc));
908 	    ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
909 	    if (ibt_status != IBT_SUCCESS)
910 		return;
911 
912 	/*
913 	 * Got a send completion
914 	 */
915 #ifdef DEBUG
916 	    if (rib_debug > 1 && wc.wc_status != IBT_WC_SUCCESS) {
917 		cmn_err(CE_NOTE, "rib_svc_scq_handler: WR completed in error "
918 			"wc.wc_status:%d, wc_id:%llX",
919 			wc.wc_status, (longlong_t)wc.wc_id);
920 	    }
921 #endif
922 	    if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
923 		struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
924 
925 		mutex_enter(&wd->sendwait_lock);
926 		if (wd->cv_sig == 1) {
927 			/*
928 			 * Update completion status and notify poster
929 			 */
930 			if (wc.wc_status == IBT_WC_SUCCESS)
931 				wd->status = RDMA_SUCCESS;
932 			else
933 				wd->status = RDMA_FAILED;
934 			cv_signal(&wd->wait_cv);
935 			mutex_exit(&wd->sendwait_lock);
936 		} else {
937 			/*
938 			 * Poster not waiting for notification.
939 			 * Free the send buffers and send_wid
940 			 */
941 			for (i = 0; i < wd->nsbufs; i++) {
942 				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
943 					(void *)(uintptr_t)wd->sbufaddr[i]);
944 			}
945 			mutex_exit(&wd->sendwait_lock);
946 			(void) rib_free_sendwait(wd);
947 		}
948 	    }
949 	}
950 }
951 
952 /*
953  * RCQ handler
954  */
955 /* ARGSUSED */
956 static void
957 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
958 {
959 	rib_qp_t	*qp;
960 	ibt_status_t	ibt_status;
961 	ibt_wc_t	wc;
962 	struct recv_wid	*rwid;
963 
964 	/*
965 	 * Re-enable cq notify here to avoid missing any
966 	 * completion queue notification.
967 	 */
968 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
969 
970 	ibt_status = IBT_SUCCESS;
971 	while (ibt_status != IBT_CQ_EMPTY) {
972 		bzero(&wc, sizeof (wc));
973 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
974 		if (ibt_status != IBT_SUCCESS)
975 		    return;
976 
977 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
978 		qp = rwid->qp;
979 		if (wc.wc_status == IBT_WC_SUCCESS) {
980 		    XDR			inxdrs, *xdrs;
981 		    uint_t		xid, vers, op, find_xid = 0;
982 		    struct reply	*r;
983 		    CONN *conn = qptoc(qp);
984 
985 		    xdrs = &inxdrs;
986 		    xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
987 			wc.wc_bytes_xfer, XDR_DECODE);
988 		/*
989 		 * Treat xid as opaque (xid is the first entity
990 		 * in the rpc rdma message).
991 		 */
992 		    xid = *(uint32_t *)(uintptr_t)rwid->addr;
993 		/* Skip xid and set the xdr position accordingly. */
994 		    XDR_SETPOS(xdrs, sizeof (uint32_t));
995 		    (void) xdr_u_int(xdrs, &vers);
996 		    (void) xdr_u_int(xdrs, &op);
997 		    XDR_DESTROY(xdrs);
998 		    if (vers != RPCRDMA_VERS) {
999 			/*
1000 			 * Invalid RPC/RDMA version. Cannot interoperate.
1001 			 * Set connection to ERROR state and bail out.
1002 			 */
1003 			mutex_enter(&conn->c_lock);
1004 			if (conn->c_state != C_DISCONN_PEND)
1005 				conn->c_state = C_ERROR;
1006 			mutex_exit(&conn->c_lock);
1007 			rib_rbuf_free(conn, RECV_BUFFER,
1008 				(void *)(uintptr_t)rwid->addr);
1009 			rib_free_wid(rwid);
1010 			continue;
1011 		    }
1012 
1013 		    mutex_enter(&qp->replylist_lock);
1014 		    for (r = qp->replylist; r != NULL; r = r->next) {
1015 			if (r->xid == xid) {
1016 			    find_xid = 1;
1017 			    switch (op) {
1018 			    case RDMA_MSG:
1019 			    case RDMA_NOMSG:
1020 			    case RDMA_MSGP:
1021 				r->status = RDMA_SUCCESS;
1022 				r->vaddr_cq = rwid->addr;
1023 				r->bytes_xfer = wc.wc_bytes_xfer;
1024 				cv_signal(&r->wait_cv);
1025 				break;
1026 			    default:
1027 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1028 						(void *)(uintptr_t)rwid->addr);
1029 				break;
1030 			    }
1031 			    break;
1032 			}
1033 		    }
1034 		    mutex_exit(&qp->replylist_lock);
1035 		    if (find_xid == 0) {
1036 			/* RPC caller not waiting for reply */
1037 #ifdef DEBUG
1038 			    if (rib_debug) {
1039 			cmn_err(CE_NOTE, "rib_clnt_rcq_handler: "
1040 			    "NO matching xid %u!\n", xid);
1041 			    }
1042 #endif
1043 			rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1044 				(void *)(uintptr_t)rwid->addr);
1045 		    }
1046 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1047 			CONN *conn = qptoc(qp);
1048 
1049 			/*
1050 			 * Connection being flushed. Just free
1051 			 * the posted buffer
1052 			 */
1053 			rib_rbuf_free(conn, RECV_BUFFER,
1054 				(void *)(uintptr_t)rwid->addr);
1055 		} else {
1056 			CONN *conn = qptoc(qp);
1057 /*
1058  *  RC Recv Q Error Code		Local state     Remote State
1059  *  ====================		===========     ============
1060  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1061  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1062  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1063  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1064  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1065  *  IBT_WC_WR_FLUSHED_ERR               None            None
1066  */
1067 			/*
1068 			 * Channel in error state. Set connection
1069 			 * in ERROR state.
1070 			 */
1071 			mutex_enter(&conn->c_lock);
1072 			if (conn->c_state != C_DISCONN_PEND)
1073 				conn->c_state = C_ERROR;
1074 			mutex_exit(&conn->c_lock);
1075 			rib_rbuf_free(conn, RECV_BUFFER,
1076 				(void *)(uintptr_t)rwid->addr);
1077 		}
1078 		rib_free_wid(rwid);
1079 	}
1080 }
1081 
1082 /* Server side */
1083 /* ARGSUSED */
1084 static void
1085 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1086 {
1087 	struct recv_data *rd;
1088 	rib_qp_t	*qp;
1089 	ibt_status_t	ibt_status;
1090 	ibt_wc_t	wc;
1091 	struct svc_recv	*s_recvp;
1092 	CONN		*conn;
1093 	mblk_t		*mp;
1094 
1095 	/*
1096 	 * Re-enable cq notify here to avoid missing any
1097 	 * completion queue notification.
1098 	 */
1099 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1100 
1101 	ibt_status = IBT_SUCCESS;
1102 	while (ibt_status != IBT_CQ_EMPTY) {
1103 		bzero(&wc, sizeof (wc));
1104 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1105 		if (ibt_status != IBT_SUCCESS)
1106 		    return;
1107 
1108 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1109 		qp = s_recvp->qp;
1110 		conn = qptoc(qp);
1111 		mutex_enter(&qp->posted_rbufs_lock);
1112 		qp->n_posted_rbufs--;
1113 		if (qp->n_posted_rbufs == 0)
1114 			cv_signal(&qp->posted_rbufs_cv);
1115 		mutex_exit(&qp->posted_rbufs_lock);
1116 
1117 		if (wc.wc_status == IBT_WC_SUCCESS) {
1118 		    XDR		inxdrs, *xdrs;
1119 		    uint_t	xid, vers, op;
1120 
1121 		    xdrs = &inxdrs;
1122 		    /* s_recvp->vaddr stores data */
1123 		    xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1124 			wc.wc_bytes_xfer, XDR_DECODE);
1125 
1126 		/*
1127 		 * Treat xid as opaque (xid is the first entity
1128 		 * in the rpc rdma message).
1129 		 */
1130 		    xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1131 		/* Skip xid and set the xdr position accordingly. */
1132 		    XDR_SETPOS(xdrs, sizeof (uint32_t));
1133 		    if (!xdr_u_int(xdrs, &vers) ||
1134 			!xdr_u_int(xdrs, &op)) {
1135 			rib_rbuf_free(conn, RECV_BUFFER,
1136 				(void *)(uintptr_t)s_recvp->vaddr);
1137 			XDR_DESTROY(xdrs);
1138 #ifdef DEBUG
1139 			cmn_err(CE_NOTE, "rib_svc_rcq_handler: "
1140 			    "xdr_u_int failed for qp %p, wc_id=%llx",
1141 			    (void *)qp, (longlong_t)wc.wc_id);
1142 #endif
1143 			(void) rib_free_svc_recv(s_recvp);
1144 			continue;
1145 		    }
1146 		    XDR_DESTROY(xdrs);
1147 
1148 		    if (vers != RPCRDMA_VERS) {
1149 			/*
1150 			 * Invalid RPC/RDMA version. Drop rpc rdma message.
1151 			 */
1152 			rib_rbuf_free(conn, RECV_BUFFER,
1153 				(void *)(uintptr_t)s_recvp->vaddr);
1154 			(void) rib_free_svc_recv(s_recvp);
1155 			continue;
1156 		    }
1157 			/*
1158 			 * Is this for RDMA_DONE?
1159 			 */
1160 		    if (op == RDMA_DONE) {
1161 			rib_rbuf_free(conn, RECV_BUFFER,
1162 				(void *)(uintptr_t)s_recvp->vaddr);
1163 			/*
1164 			 * Wake up the thread waiting on
1165 			 * a RDMA_DONE for xid
1166 			 */
1167 			mutex_enter(&qp->rdlist_lock);
1168 			rdma_done_notify(qp, xid);
1169 			mutex_exit(&qp->rdlist_lock);
1170 			(void) rib_free_svc_recv(s_recvp);
1171 			continue;
1172 		    }
1173 
1174 		    mutex_enter(&plugin_state_lock);
1175 		    if (plugin_state == ACCEPT) {
1176 			while ((mp = allocb(sizeof (*rd), BPRI_LO)) == NULL)
1177 			    (void) strwaitbuf(sizeof (*rd), BPRI_LO);
1178 			/*
1179 			 * Plugin is in accept state, hence the master
1180 			 * transport queue for this is still accepting
1181 			 * requests. Hence we can call svc_queuereq to
1182 			 * queue this recieved msg.
1183 			 */
1184 			rd = (struct recv_data *)mp->b_rptr;
1185 			rd->conn = conn;
1186 			rd->rpcmsg.addr = (caddr_t)(uintptr_t)s_recvp->vaddr;
1187 			rd->rpcmsg.type = RECV_BUFFER;
1188 			rd->rpcmsg.len = wc.wc_bytes_xfer;
1189 			rd->status = wc.wc_status;
1190 			mutex_enter(&conn->c_lock);
1191 			conn->c_ref++;
1192 			mutex_exit(&conn->c_lock);
1193 			mp->b_wptr += sizeof (*rd);
1194 			svc_queuereq((queue_t *)rib_stat->q, mp);
1195 			mutex_exit(&plugin_state_lock);
1196 		    } else {
1197 			/*
1198 			 * The master transport for this is going
1199 			 * away and the queue is not accepting anymore
1200 			 * requests for krpc, so don't do anything, just
1201 			 * free the msg.
1202 			 */
1203 			mutex_exit(&plugin_state_lock);
1204 			rib_rbuf_free(conn, RECV_BUFFER,
1205 			(void *)(uintptr_t)s_recvp->vaddr);
1206 		    }
1207 		} else {
1208 			rib_rbuf_free(conn, RECV_BUFFER,
1209 				(void *)(uintptr_t)s_recvp->vaddr);
1210 		}
1211 		(void) rib_free_svc_recv(s_recvp);
1212 	}
1213 }
1214 
1215 /*
1216  * Handles DR event of IBT_HCA_DETACH_EVENT.
1217  */
1218 /* ARGSUSED */
1219 static void
1220 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1221 	ibt_async_code_t code, ibt_async_event_t *event)
1222 {
1223 
1224 	switch (code) {
1225 	case IBT_HCA_ATTACH_EVENT:
1226 		/* ignore */
1227 		break;
1228 	case IBT_HCA_DETACH_EVENT:
1229 	{
1230 		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1231 		rib_detach_hca(rib_stat->hca);
1232 #ifdef DEBUG
1233 	cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1234 #endif
1235 		break;
1236 	}
1237 #ifdef DEBUG
1238 	case IBT_EVENT_PATH_MIGRATED:
1239 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PATH_MIGRATED\n");
1240 		break;
1241 	case IBT_EVENT_SQD:
1242 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1243 		break;
1244 	case IBT_EVENT_COM_EST:
1245 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1246 		break;
1247 	case IBT_ERROR_CATASTROPHIC_CHAN:
1248 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CATASTROPHIC_CHAN\n");
1249 		break;
1250 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1251 	cmn_err(CE_NOTE, "rib_async_handler(): "
1252 		"IBT_ERROR_INVALID_REQUEST_CHAN\n");
1253 		break;
1254 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1255 	cmn_err(CE_NOTE, "rib_async_handler(): "
1256 		"IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1257 		break;
1258 	case IBT_ERROR_PATH_MIGRATE_REQ:
1259 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PATH_MIGRATE_REQ\n");
1260 		break;
1261 	case IBT_ERROR_CQ:
1262 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1263 		break;
1264 	case IBT_ERROR_PORT_DOWN:
1265 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1266 		break;
1267 	case IBT_EVENT_PORT_UP:
1268 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1269 		break;
1270 	case IBT_ASYNC_OPAQUE1:
1271 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1272 		break;
1273 	case IBT_ASYNC_OPAQUE2:
1274 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1275 		break;
1276 	case IBT_ASYNC_OPAQUE3:
1277 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1278 		break;
1279 	case IBT_ASYNC_OPAQUE4:
1280 	cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1281 		break;
1282 #endif
1283 	default:
1284 		break;
1285 	}
1286 }
1287 
1288 /*
1289  * Client's reachable function.
1290  */
1291 static rdma_stat
1292 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1293 {
1294 	rib_hca_t	*hca;
1295 	rdma_stat	status;
1296 
1297 	/*
1298 	 * First check if a hca is still attached
1299 	 */
1300 	*handle = NULL;
1301 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
1302 	if (rib_stat->hca->state != HCA_INITED) {
1303 		rw_exit(&rib_stat->hca->state_lock);
1304 		return (RDMA_FAILED);
1305 	}
1306 	status = rib_ping_srv(addr_type, raddr, &hca);
1307 	rw_exit(&rib_stat->hca->state_lock);
1308 
1309 	if (status == RDMA_SUCCESS) {
1310 		*handle = (void *)hca;
1311 		/*
1312 		 * Register the Address translation service
1313 		 */
1314 		mutex_enter(&rib_stat->open_hca_lock);
1315 		if (ats_running == 0) {
1316 			if (rib_register_ats(rib_stat->hca)
1317 			    == RDMA_SUCCESS) {
1318 				ats_running = 1;
1319 				mutex_exit(&rib_stat->open_hca_lock);
1320 				return (RDMA_SUCCESS);
1321 			} else {
1322 				mutex_exit(&rib_stat->open_hca_lock);
1323 				return (RDMA_FAILED);
1324 			}
1325 		} else {
1326 			mutex_exit(&rib_stat->open_hca_lock);
1327 			return (RDMA_SUCCESS);
1328 		}
1329 	} else {
1330 		*handle = NULL;
1331 		if (rib_debug > 2)
1332 		    cmn_err(CE_WARN, "rib_reachable(): ping_srv failed.\n");
1333 		return (RDMA_FAILED);
1334 	}
1335 }
1336 
1337 /* Client side qp creation */
1338 static rdma_stat
1339 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1340 {
1341 	rib_qp_t	*kqp = NULL;
1342 	CONN		*conn;
1343 
1344 	ASSERT(qp != NULL);
1345 	*qp = NULL;
1346 
1347 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1348 	conn = qptoc(kqp);
1349 	kqp->hca = hca;
1350 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1351 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1352 
1353 	kqp->mode = RIB_CLIENT;
1354 	kqp->chan_flags = IBT_BLOCKING;
1355 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1356 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1357 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1358 
1359 	/*
1360 	 * Initialize
1361 	 */
1362 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1363 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1364 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1365 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1366 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1367 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1368 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1369 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1370 
1371 	*qp = kqp;
1372 	return (RDMA_SUCCESS);
1373 }
1374 
1375 /* Server side qp creation */
1376 static rdma_stat
1377 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1378 {
1379 	rib_qp_t	*kqp = NULL;
1380 	ibt_chan_sizes_t	chan_sizes;
1381 	ibt_rc_chan_alloc_args_t	qp_attr;
1382 	ibt_status_t		ibt_status;
1383 
1384 	ASSERT(qp != NULL);
1385 	*qp = NULL;
1386 
1387 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1388 	kqp->hca = hca;
1389 	kqp->port_num = port;
1390 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1391 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1392 
1393 	/*
1394 	 * Create the qp handle
1395 	 */
1396 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1397 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1398 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1399 	qp_attr.rc_pd = hca->pd_hdl;
1400 	qp_attr.rc_hca_port_num = port;
1401 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1402 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1403 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1404 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1405 	qp_attr.rc_clone_chan = NULL;
1406 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1407 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1408 
1409 	rw_enter(&hca->state_lock, RW_READER);
1410 	if (hca->state != HCA_DETACHED) {
1411 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1412 			IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1413 			&chan_sizes);
1414 	} else {
1415 		rw_exit(&hca->state_lock);
1416 		goto fail;
1417 	}
1418 	rw_exit(&hca->state_lock);
1419 
1420 	if (ibt_status != IBT_SUCCESS) {
1421 		cmn_err(CE_WARN, "rib_svc_create_chan: "
1422 			"ibt_alloc_rc_channel failed, ibt_status=%d.",
1423 			ibt_status);
1424 		goto fail;
1425 	}
1426 
1427 	kqp->mode = RIB_SERVER;
1428 	kqp->chan_flags = IBT_BLOCKING;
1429 	kqp->q = q;	/* server ONLY */
1430 
1431 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1432 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1433 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1434 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1435 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1436 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1437 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1438 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1439 	/*
1440 	 * Set the private data area to qp to be used in callbacks
1441 	 */
1442 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1443 	kqp->rdmaconn.c_state = C_CONNECTED;
1444 	*qp = kqp;
1445 	return (RDMA_SUCCESS);
1446 fail:
1447 	if (kqp)
1448 		kmem_free(kqp, sizeof (rib_qp_t));
1449 
1450 	return (RDMA_FAILED);
1451 }
1452 
1453 void
1454 rib_dump_pathrec(ibt_path_info_t *path_rec)
1455 {
1456 	ib_pkey_t	pkey;
1457 
1458 	if (rib_debug > 1) {
1459 	    cmn_err(CE_NOTE, "Path Record:\n");
1460 
1461 	    cmn_err(CE_NOTE, "Source HCA GUID = %llx\n",
1462 		(longlong_t)path_rec->pi_hca_guid);
1463 	    cmn_err(CE_NOTE, "Dest Service ID = %llx\n",
1464 		(longlong_t)path_rec->pi_sid);
1465 	    cmn_err(CE_NOTE, "Port Num        = %02d\n",
1466 		path_rec->pi_prim_cep_path.cep_hca_port_num);
1467 	    cmn_err(CE_NOTE, "P_Key Index     = %04d\n",
1468 		path_rec->pi_prim_cep_path.cep_pkey_ix);
1469 
1470 	    (void) ibt_index2pkey_byguid(path_rec->pi_hca_guid,
1471 			path_rec->pi_prim_cep_path.cep_hca_port_num,
1472 			path_rec->pi_prim_cep_path.cep_pkey_ix, &pkey);
1473 	    cmn_err(CE_NOTE, "P_Key		= 0x%x\n", pkey);
1474 
1475 
1476 	    cmn_err(CE_NOTE, "SGID:           = %llx:%llx\n",
1477 		(longlong_t)
1478 		path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_prefix,
1479 		(longlong_t)
1480 		path_rec->pi_prim_cep_path.cep_adds_vect.av_sgid.gid_guid);
1481 
1482 	    cmn_err(CE_NOTE, "DGID:           = %llx:%llx\n",
1483 		(longlong_t)
1484 		path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_prefix,
1485 		(longlong_t)
1486 		path_rec->pi_prim_cep_path.cep_adds_vect.av_dgid.gid_guid);
1487 
1488 	    cmn_err(CE_NOTE, "Path Rate       = %02x\n",
1489 		path_rec->pi_prim_cep_path.cep_adds_vect.av_srate);
1490 	    cmn_err(CE_NOTE, "SL              = %02x\n",
1491 		path_rec->pi_prim_cep_path.cep_adds_vect.av_srvl);
1492 	    cmn_err(CE_NOTE, "Prim Packet LT  = %02x\n",
1493 		path_rec->pi_prim_pkt_lt);
1494 	    cmn_err(CE_NOTE, "Path MTU        = %02x\n",
1495 		path_rec->pi_path_mtu);
1496 	}
1497 }
1498 
1499 /* ARGSUSED */
1500 ibt_cm_status_t
1501 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1502     ibt_cm_return_args_t *ret_args, void *priv_data,
1503     ibt_priv_data_len_t len)
1504 {
1505 	rpcib_state_t   *ribstat;
1506 	rib_hca_t	*hca;
1507 
1508 	ribstat = (rpcib_state_t *)clnt_hdl;
1509 	hca = (rib_hca_t *)ribstat->hca;
1510 
1511 	switch (event->cm_type) {
1512 
1513 	/* got a connection close event */
1514 	case IBT_CM_EVENT_CONN_CLOSED:
1515 	{
1516 		CONN	*conn;
1517 		rib_qp_t *qp;
1518 
1519 		/* check reason why connection was closed */
1520 		switch (event->cm_event.closed) {
1521 		case IBT_CM_CLOSED_DREP_RCVD:
1522 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1523 		case IBT_CM_CLOSED_DUP:
1524 		case IBT_CM_CLOSED_ABORT:
1525 		case IBT_CM_CLOSED_ALREADY:
1526 			/*
1527 			 * These cases indicate the local end initiated
1528 			 * the closing of the channel. Nothing to do here.
1529 			 */
1530 			break;
1531 		default:
1532 			/*
1533 			 * Reason for CONN_CLOSED event must be one of
1534 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1535 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1536 			 * the remote end is closing the channel. In these
1537 			 * cases free the channel and transition to error
1538 			 * state
1539 			 */
1540 			qp = ibt_get_chan_private(event->cm_channel);
1541 			conn = qptoc(qp);
1542 			mutex_enter(&conn->c_lock);
1543 			if (conn->c_state == C_DISCONN_PEND) {
1544 				mutex_exit(&conn->c_lock);
1545 				break;
1546 			}
1547 
1548 			conn->c_state = C_ERROR;
1549 
1550 			/*
1551 			 * Free the rc_channel. Channel has already
1552 			 * transitioned to ERROR state and WRs have been
1553 			 * FLUSHED_ERR already.
1554 			 */
1555 			(void) ibt_free_channel(qp->qp_hdl);
1556 			qp->qp_hdl = NULL;
1557 
1558 			/*
1559 			 * Free the conn if c_ref is down to 0 already
1560 			 */
1561 			if (conn->c_ref == 0) {
1562 				/*
1563 				 * Remove from list and free conn
1564 				 */
1565 				conn->c_state = C_DISCONN_PEND;
1566 				mutex_exit(&conn->c_lock);
1567 				(void) rib_disconnect_channel(conn,
1568 					&hca->cl_conn_list);
1569 			} else {
1570 				mutex_exit(&conn->c_lock);
1571 			}
1572 #ifdef DEBUG
1573 			if (rib_debug)
1574 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1575 					"(CONN_CLOSED) channel disconnected");
1576 #endif
1577 			break;
1578 		}
1579 		break;
1580 	}
1581 	default:
1582 		break;
1583 	}
1584 	return (IBT_CM_ACCEPT);
1585 }
1586 
1587 
1588 /* Check if server has done ATS registration */
1589 rdma_stat
1590 rib_chk_srv_ats(rib_hca_t *hca, struct netbuf *raddr,
1591 	int addr_type, ibt_path_info_t *path)
1592 {
1593 	struct sockaddr_in	*sin4;
1594 	struct sockaddr_in6	*sin6;
1595 	ibt_path_attr_t		path_attr;
1596 	ibt_status_t		ibt_status;
1597 	ib_pkey_t		pkey;
1598 	ibt_ar_t		ar_query, ar_result;
1599 	rib_service_t		*ats;
1600 	ib_gid_t		sgid;
1601 	ibt_path_info_t		paths[MAX_PORTS];
1602 	uint8_t			npaths, i;
1603 
1604 	(void) bzero(&path_attr, sizeof (ibt_path_attr_t));
1605 	(void) bzero(path, sizeof (ibt_path_info_t));
1606 
1607 	/*
1608 	 * Construct svc name
1609 	 */
1610 	path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
1611 	switch (addr_type) {
1612 	case AF_INET:
1613 		sin4 = (struct sockaddr_in *)raddr->buf;
1614 		(void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
1615 		    IB_SVC_NAME_LEN);
1616 		break;
1617 
1618 	case AF_INET6:
1619 		sin6 = (struct sockaddr_in6 *)raddr->buf;
1620 		(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
1621 		    path_attr.pa_sname, IB_SVC_NAME_LEN);
1622 		break;
1623 
1624 	default:
1625 		kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1626 		return (RDMA_INVAL);
1627 	}
1628 	(void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);
1629 
1630 	/*
1631 	 * Attempt a path to the server on an ATS-registered port.
1632 	 * Try all ATS-registered ports until one succeeds.
1633 	 * The first one that succeeds will be used to connect
1634 	 * to the server.  If none of them succeed, return RDMA_FAILED.
1635 	 */
1636 	rw_enter(&hca->state_lock, RW_READER);
1637 	if (hca->state != HCA_DETACHED) {
1638 	    rw_enter(&hca->service_list_lock, RW_READER);
1639 	    for (ats = hca->ats_list; ats != NULL; ats = ats->srv_next) {
1640 		path_attr.pa_hca_guid = hca->hca_guid;
1641 		path_attr.pa_hca_port_num = ats->srv_port;
1642 		ibt_status = ibt_get_paths(hca->ibt_clnt_hdl,
1643 			IBT_PATH_MULTI_SVC_DEST, &path_attr, 2, paths, &npaths);
1644 		if (ibt_status == IBT_SUCCESS ||
1645 			ibt_status == IBT_INSUFF_DATA) {
1646 		    for (i = 0; i < npaths; i++) {
1647 			if (paths[i].pi_hca_guid) {
1648 			/*
1649 			 * do ibt_query_ar()
1650 			 */
1651 			    sgid =
1652 				paths[i].pi_prim_cep_path.cep_adds_vect.av_sgid;
1653 
1654 			    (void) ibt_index2pkey_byguid(paths[i].pi_hca_guid,
1655 				paths[i].pi_prim_cep_path.cep_hca_port_num,
1656 				paths[i].pi_prim_cep_path.cep_pkey_ix, &pkey);
1657 
1658 			    bzero(&ar_query, sizeof (ar_query));
1659 			    bzero(&ar_result, sizeof (ar_result));
1660 			    ar_query.ar_gid =
1661 				paths[i].pi_prim_cep_path.cep_adds_vect.av_dgid;
1662 			    ar_query.ar_pkey = pkey;
1663 			    ibt_status = ibt_query_ar(&sgid, &ar_query,
1664 					&ar_result);
1665 			    if (ibt_status == IBT_SUCCESS) {
1666 #ifdef DEBUG
1667 				if (rib_debug > 1)
1668 				    rib_dump_pathrec(&paths[i]);
1669 #endif
1670 				bcopy(&paths[i], path,
1671 					sizeof (ibt_path_info_t));
1672 				rw_exit(&hca->service_list_lock);
1673 				kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1674 				rw_exit(&hca->state_lock);
1675 				return (RDMA_SUCCESS);
1676 			    }
1677 #ifdef DEBUG
1678 			    if (rib_debug) {
1679 				cmn_err(CE_NOTE, "rib_chk_srv_ats: "
1680 				    "ibt_query_ar FAILED, return\n");
1681 			    }
1682 #endif
1683 			}
1684 		    }
1685 		}
1686 	    }
1687 	    rw_exit(&hca->service_list_lock);
1688 	}
1689 	kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1690 	rw_exit(&hca->state_lock);
1691 	return (RDMA_FAILED);
1692 }
1693 
1694 
1695 /*
1696  * Connect to the server.
1697  */
1698 rdma_stat
1699 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path)
1700 {
1701 	ibt_chan_open_args_t	chan_args;	/* channel args */
1702 	ibt_chan_sizes_t	chan_sizes;
1703 	ibt_rc_chan_alloc_args_t	qp_attr;
1704 	ibt_status_t		ibt_status;
1705 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1706 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1707 
1708 	(void) bzero(&chan_args, sizeof (chan_args));
1709 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1710 
1711 	qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
1712 	/* Alloc a RC channel */
1713 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1714 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1715 	qp_attr.rc_pd = hca->pd_hdl;
1716 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1717 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1718 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1719 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1720 	qp_attr.rc_clone_chan = NULL;
1721 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1722 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1723 
1724 	chan_args.oc_path = path;
1725 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1726 	chan_args.oc_cm_clnt_private = (void *)rib_stat;
1727 	chan_args.oc_rdma_ra_out = 1;
1728 	chan_args.oc_rdma_ra_in = 1;
1729 	chan_args.oc_path_retry_cnt = 2;
1730 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1731 
1732 refresh:
1733 	rw_enter(&hca->state_lock, RW_READER);
1734 	if (hca->state != HCA_DETACHED) {
1735 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1736 			IBT_ACHAN_NO_FLAGS, &qp_attr, &qp->qp_hdl,
1737 			&chan_sizes);
1738 	} else {
1739 		rw_exit(&hca->state_lock);
1740 		return (RDMA_FAILED);
1741 	}
1742 	rw_exit(&hca->state_lock);
1743 
1744 	if (ibt_status != IBT_SUCCESS) {
1745 #ifdef DEBUG
1746 		cmn_err(CE_WARN, "rib_conn_to_srv: alloc_rc_channel "
1747 		"failed, ibt_status=%d.", ibt_status);
1748 #endif
1749 		return (RDMA_FAILED);
1750 	}
1751 
1752 	/* Connect to the Server */
1753 	(void) bzero(&ret_args, sizeof (ret_args));
1754 	mutex_enter(&qp->cb_lock);
1755 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1756 			IBT_BLOCKING, &chan_args, &ret_args);
1757 	if (ibt_status != IBT_SUCCESS) {
1758 #ifdef DEBUG
1759 		if (rib_debug)
1760 			cmn_err(CE_WARN, "rib_conn_to_srv: open_rc_channel"
1761 				" failed for qp %p, status=%d, "
1762 				"ret_args.rc_status=%d\n",
1763 				(void *)qp, ibt_status, ret_args.rc_status);
1764 #endif
1765 		(void) ibt_free_channel(qp->qp_hdl);
1766 		qp->qp_hdl = NULL;
1767 		mutex_exit(&qp->cb_lock);
1768 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1769 			ret_args.rc_status == IBT_CM_CONN_STALE) {
1770 			/*
1771 			 * Got IBT_CM_CONN_STALE probably because of stale
1772 			 * data on the passive end of a channel that existed
1773 			 * prior to reboot. Retry establishing a channel
1774 			 * REFRESH_ATTEMPTS times, during which time the
1775 			 * stale conditions on the server might clear up.
1776 			 */
1777 			goto refresh;
1778 		}
1779 		return (RDMA_FAILED);
1780 	}
1781 	mutex_exit(&qp->cb_lock);
1782 	/*
1783 	 * Set the private data area to qp to be used in callbacks
1784 	 */
1785 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1786 	return (RDMA_SUCCESS);
1787 }
1788 
1789 rdma_stat
1790 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
1791 {
1792 	struct sockaddr_in	*sin4;
1793 	struct sockaddr_in6	*sin6;
1794 	ibt_path_attr_t		path_attr;
1795 	ibt_path_info_t		path;
1796 	ibt_status_t		ibt_status;
1797 
1798 	ASSERT(raddr->buf != NULL);
1799 
1800 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1801 	bzero(&path, sizeof (ibt_path_info_t));
1802 
1803 	/*
1804 	 * Conctruct svc name
1805 	 */
1806 	path_attr.pa_sname = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
1807 	switch (addr_type) {
1808 	case AF_INET:
1809 		sin4 = (struct sockaddr_in *)raddr->buf;
1810 		(void) inet_ntop(AF_INET, &sin4->sin_addr, path_attr.pa_sname,
1811 		    IB_SVC_NAME_LEN);
1812 		break;
1813 
1814 	case AF_INET6:
1815 		sin6 = (struct sockaddr_in6 *)raddr->buf;
1816 		(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
1817 		    path_attr.pa_sname, IB_SVC_NAME_LEN);
1818 		break;
1819 
1820 	default:
1821 #ifdef	DEBUG
1822 	    if (rib_debug) {
1823 		cmn_err(CE_WARN, "rib_ping_srv: Address not recognized\n");
1824 	    }
1825 #endif
1826 		kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1827 		return (RDMA_INVAL);
1828 	}
1829 	(void) strlcat(path_attr.pa_sname, "::NFS", IB_SVC_NAME_LEN);
1830 
1831 	ibt_status = ibt_get_paths(rib_stat->ibt_clnt_hdl,
1832 		IBT_PATH_NO_FLAGS, &path_attr, 1, &path, NULL);
1833 	kmem_free(path_attr.pa_sname, IB_SVC_NAME_LEN);
1834 	if (ibt_status != IBT_SUCCESS) {
1835 	    if (rib_debug > 1) {
1836 		cmn_err(CE_WARN, "rib_ping_srv: ibt_get_paths FAILED!"
1837 			" status=%d\n", ibt_status);
1838 	    }
1839 	} else if (path.pi_hca_guid) {
1840 		ASSERT(path.pi_hca_guid == rib_stat->hca->hca_guid);
1841 		*hca = rib_stat->hca;
1842 		return (RDMA_SUCCESS);
1843 	}
1844 	return (RDMA_FAILED);
1845 }
1846 
1847 /*
1848  * Close channel, remove from connection list and
1849  * free up resources allocated for that channel.
1850  */
1851 rdma_stat
1852 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
1853 {
1854 	rib_qp_t	*qp = ctoqp(conn);
1855 	rib_hca_t	*hca;
1856 
1857 	/*
1858 	 * c_ref == 0 and connection is in C_DISCONN_PEND
1859 	 */
1860 	hca = qp->hca;
1861 	if (conn_list != NULL)
1862 		(void) rib_rm_conn(conn, conn_list);
1863 	if (qp->qp_hdl != NULL) {
1864 		/*
1865 		 * If the channel has not been establised,
1866 		 * ibt_flush_channel is called to flush outstanding WRs
1867 		 * on the Qs.  Otherwise, ibt_close_rc_channel() is
1868 		 * called.  The channel is then freed.
1869 		 */
1870 		if (conn_list != NULL)
1871 		    (void) ibt_close_rc_channel(qp->qp_hdl,
1872 			IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
1873 		else
1874 		    (void) ibt_flush_channel(qp->qp_hdl);
1875 
1876 		mutex_enter(&qp->posted_rbufs_lock);
1877 		while (qp->n_posted_rbufs)
1878 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
1879 		mutex_exit(&qp->posted_rbufs_lock);
1880 		(void) ibt_free_channel(qp->qp_hdl);
1881 		qp->qp_hdl = NULL;
1882 	}
1883 	ASSERT(qp->rdlist == NULL);
1884 	if (qp->replylist != NULL) {
1885 		(void) rib_rem_replylist(qp);
1886 	}
1887 
1888 	cv_destroy(&qp->cb_conn_cv);
1889 	cv_destroy(&qp->posted_rbufs_cv);
1890 	mutex_destroy(&qp->cb_lock);
1891 
1892 	mutex_destroy(&qp->replylist_lock);
1893 	mutex_destroy(&qp->posted_rbufs_lock);
1894 	mutex_destroy(&qp->rdlist_lock);
1895 
1896 	cv_destroy(&conn->c_cv);
1897 	mutex_destroy(&conn->c_lock);
1898 
1899 	if (conn->c_raddr.buf != NULL) {
1900 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
1901 	}
1902 	if (conn->c_laddr.buf != NULL) {
1903 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
1904 	}
1905 	kmem_free(qp, sizeof (rib_qp_t));
1906 
1907 	/*
1908 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
1909 	 * then the hca is no longer being used.
1910 	 */
1911 	if (conn_list != NULL) {
1912 		rw_enter(&hca->state_lock, RW_READER);
1913 		if (hca->state == HCA_DETACHED) {
1914 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
1915 			if (hca->srv_conn_list.conn_hd == NULL) {
1916 				rw_enter(&hca->cl_conn_list.conn_lock,
1917 					RW_READER);
1918 				if (hca->cl_conn_list.conn_hd == NULL) {
1919 					mutex_enter(&hca->inuse_lock);
1920 					hca->inuse = FALSE;
1921 					cv_signal(&hca->cb_cv);
1922 					mutex_exit(&hca->inuse_lock);
1923 				}
1924 				rw_exit(&hca->cl_conn_list.conn_lock);
1925 			}
1926 			rw_exit(&hca->srv_conn_list.conn_lock);
1927 		}
1928 		rw_exit(&hca->state_lock);
1929 	}
1930 	return (RDMA_SUCCESS);
1931 }
1932 
1933 /*
1934  * Wait for send completion notification. Only on receiving a
1935  * notification be it a successful or error completion, free the
1936  * send_wid.
1937  */
1938 static rdma_stat
1939 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
1940 {
1941 	clock_t timout, cv_wait_ret;
1942 	rdma_stat error = RDMA_SUCCESS;
1943 	int	i;
1944 
1945 	/*
1946 	 * Wait for send to complete
1947 	 */
1948 	ASSERT(wd != NULL);
1949 	mutex_enter(&wd->sendwait_lock);
1950 	if (wd->status == (uint_t)SEND_WAIT) {
1951 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
1952 		    ddi_get_lbolt();
1953 		if (qp->mode == RIB_SERVER) {
1954 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
1955 				    &wd->sendwait_lock, timout)) > 0 &&
1956 			    wd->status == (uint_t)SEND_WAIT)
1957 				;
1958 			switch (cv_wait_ret) {
1959 			case -1:	/* timeout */
1960 #ifdef DEBUG
1961 				if (rib_debug > 2)
1962 					cmn_err(CE_WARN, "rib_sendwait: "
1963 					    "timed out qp %p\n", (void *)qp);
1964 #endif
1965 				wd->cv_sig = 0;		/* no signal needed */
1966 				error = RDMA_TIMEDOUT;
1967 				break;
1968 			default:	/* got send completion */
1969 				break;
1970 			}
1971 		} else {
1972 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
1973 				    &wd->sendwait_lock, timout)) > 0 &&
1974 			    wd->status == (uint_t)SEND_WAIT)
1975 				;
1976 			switch (cv_wait_ret) {
1977 			case -1:	/* timeout */
1978 #ifdef DEBUG
1979 				if (rib_debug > 2)
1980 					cmn_err(CE_WARN, "rib_sendwait: "
1981 					    "timed out qp %p\n", (void *)qp);
1982 #endif
1983 				wd->cv_sig = 0;		/* no signal needed */
1984 				error = RDMA_TIMEDOUT;
1985 				break;
1986 			case 0:		/* interrupted */
1987 #ifdef DEBUG
1988 				if (rib_debug > 2)
1989 					cmn_err(CE_NOTE, "rib_sendwait:"
1990 					    " interrupted on qp %p\n",
1991 					    (void *)qp);
1992 #endif
1993 				wd->cv_sig = 0;		/* no signal needed */
1994 				error = RDMA_INTR;
1995 				break;
1996 			default:	/* got send completion */
1997 				break;
1998 			}
1999 		}
2000 	}
2001 
2002 	if (wd->status != (uint_t)SEND_WAIT) {
2003 		/* got send completion */
2004 		if (wd->status != RDMA_SUCCESS) {
2005 		    error = wd->status;
2006 		    if (wd->status != RDMA_CONNLOST)
2007 			error = RDMA_FAILED;
2008 		}
2009 		for (i = 0; i < wd->nsbufs; i++) {
2010 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2011 				(void *)(uintptr_t)wd->sbufaddr[i]);
2012 		}
2013 		mutex_exit(&wd->sendwait_lock);
2014 		(void) rib_free_sendwait(wd);
2015 	} else {
2016 		mutex_exit(&wd->sendwait_lock);
2017 	}
2018 
2019 	return (error);
2020 }
2021 
2022 static struct send_wid *
2023 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2024 {
2025 	struct send_wid	*wd;
2026 
2027 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2028 	wd->xid = xid;
2029 	wd->cv_sig = cv_sig;
2030 	wd->qp = qp;
2031 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2032 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2033 	wd->status = (uint_t)SEND_WAIT;
2034 
2035 	return (wd);
2036 }
2037 
2038 static int
2039 rib_free_sendwait(struct send_wid *wdesc)
2040 {
2041 	cv_destroy(&wdesc->wait_cv);
2042 	mutex_destroy(&wdesc->sendwait_lock);
2043 	kmem_free(wdesc, sizeof (*wdesc));
2044 
2045 	return (0);
2046 }
2047 
2048 static rdma_stat
2049 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2050 {
2051 	mutex_enter(&qp->replylist_lock);
2052 	if (rep != NULL) {
2053 	    (void) rib_remreply(qp, rep);
2054 	    mutex_exit(&qp->replylist_lock);
2055 	    return (RDMA_SUCCESS);
2056 	}
2057 	mutex_exit(&qp->replylist_lock);
2058 	return (RDMA_FAILED);
2059 }
2060 
2061 /*
2062  * Send buffers are freed here only in case of error in posting
2063  * on QP. If the post succeeded, the send buffers are freed upon
2064  * send completion in rib_sendwait() or in the scq_handler.
2065  */
2066 rdma_stat
2067 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2068 	int send_sig, int cv_sig)
2069 {
2070 	struct send_wid	*wdesc;
2071 	struct clist	*clp;
2072 	ibt_status_t	ibt_status = IBT_SUCCESS;
2073 	rdma_stat	ret = RDMA_SUCCESS;
2074 	ibt_send_wr_t	tx_wr;
2075 	int		i, nds;
2076 	ibt_wr_ds_t	sgl[DSEG_MAX];
2077 	uint_t		total_msg_size;
2078 	rib_qp_t	*qp = ctoqp(conn);
2079 
2080 	ASSERT(cl != NULL);
2081 
2082 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2083 
2084 	nds = 0;
2085 	total_msg_size = 0;
2086 	clp = cl;
2087 	while (clp != NULL) {
2088 		if (nds >= DSEG_MAX) {
2089 			cmn_err(CE_WARN, "rib_send_and_wait: DSEG_MAX"
2090 			    " too small!");
2091 			return (RDMA_FAILED);
2092 		}
2093 		sgl[nds].ds_va = clp->c_saddr;
2094 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2095 		sgl[nds].ds_len = clp->c_len;
2096 		total_msg_size += clp->c_len;
2097 		clp = clp->c_next;
2098 		nds++;
2099 	}
2100 
2101 	if (send_sig) {
2102 		/* Set SEND_SIGNAL flag. */
2103 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2104 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2105 	} else {
2106 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2107 		wdesc = rib_init_sendwait(msgid, 0, qp);
2108 	}
2109 	wdesc->nsbufs = nds;
2110 	for (i = 0; i < nds; i++) {
2111 		wdesc->sbufaddr[i] = sgl[i].ds_va;
2112 	}
2113 
2114 	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2115 	tx_wr.wr_opcode = IBT_WRC_SEND;
2116 	tx_wr.wr_trans = IBT_RC_SRV;
2117 	tx_wr.wr_nds = nds;
2118 	tx_wr.wr_sgl = sgl;
2119 
2120 	mutex_enter(&conn->c_lock);
2121 	if (conn->c_state & C_CONNECTED) {
2122 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2123 	}
2124 	if (((conn->c_state & C_CONNECTED) == 0) ||
2125 		ibt_status != IBT_SUCCESS) {
2126 		mutex_exit(&conn->c_lock);
2127 		for (i = 0; i < nds; i++) {
2128 			rib_rbuf_free(conn, SEND_BUFFER,
2129 				(void *)(uintptr_t)wdesc->sbufaddr[i]);
2130 		}
2131 		(void) rib_free_sendwait(wdesc);
2132 #ifdef DEBUG
2133 		if (rib_debug && ibt_status != IBT_SUCCESS)
2134 			cmn_err(CE_WARN, "rib_send_and_wait: ibt_post_send "
2135 				"failed! wr_id %llx on qpn %p, status=%d!",
2136 				(longlong_t)tx_wr.wr_id, (void *)qp,
2137 				ibt_status);
2138 #endif
2139 		return (RDMA_FAILED);
2140 	}
2141 	mutex_exit(&conn->c_lock);
2142 
2143 	if (send_sig) {
2144 	    if (cv_sig) {
2145 		/*
2146 		 * cv_wait for send to complete.
2147 		 * We can fail due to a timeout or signal or
2148 		 * unsuccessful send.
2149 		 */
2150 		ret = rib_sendwait(qp, wdesc);
2151 #ifdef DEBUG
2152 	    if (rib_debug > 2)
2153 		if (ret != 0) {
2154 		    cmn_err(CE_WARN, "rib_send_and_wait: rib_sendwait "
2155 			"FAILED, rdma stat=%d, wr_id %llx, qp %p!",
2156 			ret, (longlong_t)tx_wr.wr_id, (void *)qp);
2157 		}
2158 #endif
2159 		return (ret);
2160 	    }
2161 	}
2162 
2163 	return (RDMA_SUCCESS);
2164 }
2165 
2166 rdma_stat
2167 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2168 {
2169 	rdma_stat	ret;
2170 
2171 	/* send-wait & cv_signal */
2172 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1);
2173 
2174 	return (ret);
2175 }
2176 
2177 /*
2178  * Server interface (svc_rdma_ksend).
2179  * Send RPC reply and wait for RDMA_DONE.
2180  */
2181 rdma_stat
2182 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2183 {
2184 	rdma_stat ret = RDMA_SUCCESS;
2185 	struct rdma_done_list *rd;
2186 	clock_t timout, cv_wait_ret;
2187 	rib_qp_t *qp = ctoqp(conn);
2188 
2189 	mutex_enter(&qp->rdlist_lock);
2190 	rd = rdma_done_add(qp, msgid);
2191 
2192 	/* No cv_signal (whether send-wait or no-send-wait) */
2193 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0);
2194 	if (ret != RDMA_SUCCESS) {
2195 #ifdef DEBUG
2196 	    cmn_err(CE_WARN, "rib_send_resp: send_and_wait "
2197 		"failed, msgid %u, qp %p", msgid, (void *)qp);
2198 #endif
2199 	    rdma_done_rm(qp, rd);
2200 	    goto done;
2201 	}
2202 
2203 	/*
2204 	 * Wait for RDMA_DONE from remote end
2205 	 */
2206 	timout = drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2207 	cv_wait_ret = cv_timedwait(&rd->rdma_done_cv, &qp->rdlist_lock,
2208 	    timout);
2209 	rdma_done_rm(qp, rd);
2210 	if (cv_wait_ret < 0) {
2211 #ifdef DEBUG
2212 		if (rib_debug > 1) {
2213 			cmn_err(CE_WARN, "rib_send_resp: RDMA_DONE not"
2214 			    " recv'd for qp %p, xid:%u\n",
2215 			    (void *)qp, msgid);
2216 		}
2217 #endif
2218 		ret = RDMA_TIMEDOUT;
2219 		goto done;
2220 	}
2221 
2222 done:
2223 	mutex_exit(&qp->rdlist_lock);
2224 	return (ret);
2225 }
2226 
2227 static struct recv_wid *
2228 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2229 {
2230 	struct recv_wid	*rwid;
2231 
2232 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2233 	rwid->xid = msgid;
2234 	rwid->addr = sgl->ds_va;
2235 	rwid->qp = qp;
2236 
2237 	return (rwid);
2238 }
2239 
2240 static void
2241 rib_free_wid(struct recv_wid *rwid)
2242 {
2243 	kmem_free(rwid, sizeof (struct recv_wid));
2244 }
2245 
2246 rdma_stat
2247 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2248 {
2249 	rib_qp_t	*qp = ctoqp(conn);
2250 	struct clist	*clp = cl;
2251 	struct reply	*rep;
2252 	struct recv_wid	*rwid;
2253 	int		nds;
2254 	ibt_wr_ds_t	sgl[DSEG_MAX];
2255 	ibt_recv_wr_t	recv_wr;
2256 	rdma_stat	ret;
2257 	ibt_status_t	ibt_status;
2258 
2259 	/*
2260 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2261 	 */
2262 
2263 	nds = 0;
2264 	while (cl != NULL) {
2265 		if (nds >= DSEG_MAX) {
2266 		    cmn_err(CE_WARN, "rib_clnt_post: DSEG_MAX too small!");
2267 		    ret = RDMA_FAILED;
2268 		    goto done;
2269 		}
2270 		sgl[nds].ds_va = cl->c_saddr;
2271 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2272 		sgl[nds].ds_len = cl->c_len;
2273 		cl = cl->c_next;
2274 		nds++;
2275 	}
2276 
2277 	if (nds != 1) {
2278 	    cmn_err(CE_WARN, "rib_clnt_post: nds!=1\n");
2279 	    ret = RDMA_FAILED;
2280 	    goto done;
2281 	}
2282 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2283 	recv_wr.wr_nds = nds;
2284 	recv_wr.wr_sgl = sgl;
2285 
2286 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2287 	if (rwid) {
2288 	    recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2289 	} else {
2290 		cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2291 		ret = RDMA_NORESOURCE;
2292 		goto done;
2293 	}
2294 	rep = rib_addreplylist(qp, msgid);
2295 	if (!rep) {
2296 		cmn_err(CE_WARN, "rib_clnt_post: out of memory");
2297 		rib_free_wid(rwid);
2298 		ret = RDMA_NORESOURCE;
2299 		goto done;
2300 	}
2301 
2302 	mutex_enter(&conn->c_lock);
2303 	if (conn->c_state & C_CONNECTED) {
2304 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2305 	}
2306 	if (((conn->c_state & C_CONNECTED) == 0) ||
2307 		ibt_status != IBT_SUCCESS) {
2308 		mutex_exit(&conn->c_lock);
2309 #ifdef DEBUG
2310 		cmn_err(CE_WARN, "rib_clnt_post: QPN %p failed in "
2311 		    "ibt_post_recv(), msgid=%d, status=%d",
2312 		    (void *)qp,  msgid, ibt_status);
2313 #endif
2314 		rib_free_wid(rwid);
2315 		(void) rib_rem_rep(qp, rep);
2316 		ret = RDMA_FAILED;
2317 		goto done;
2318 	}
2319 	mutex_exit(&conn->c_lock);
2320 	return (RDMA_SUCCESS);
2321 
2322 done:
2323 	while (clp != NULL) {
2324 	    rib_rbuf_free(conn, RECV_BUFFER, (void *)(uintptr_t)clp->c_saddr);
2325 	    clp = clp->c_next;
2326 	}
2327 	return (ret);
2328 }
2329 
2330 rdma_stat
2331 rib_svc_post(CONN* conn, struct clist *cl)
2332 {
2333 	rib_qp_t	*qp = ctoqp(conn);
2334 	struct svc_recv	*s_recvp;
2335 	int		nds;
2336 	ibt_wr_ds_t	sgl[DSEG_MAX];
2337 	ibt_recv_wr_t	recv_wr;
2338 	ibt_status_t	ibt_status;
2339 
2340 	nds = 0;
2341 	while (cl != NULL) {
2342 		if (nds >= DSEG_MAX) {
2343 		    cmn_err(CE_WARN, "rib_svc_post: DSEG_MAX too small!");
2344 		    return (RDMA_FAILED);
2345 		}
2346 		sgl[nds].ds_va = cl->c_saddr;
2347 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2348 		sgl[nds].ds_len = cl->c_len;
2349 		cl = cl->c_next;
2350 		nds++;
2351 	}
2352 
2353 	if (nds != 1) {
2354 	    cmn_err(CE_WARN, "rib_svc_post: nds!=1\n");
2355 	    rib_rbuf_free(conn, RECV_BUFFER, (caddr_t)(uintptr_t)sgl[0].ds_va);
2356 	    return (RDMA_FAILED);
2357 	}
2358 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2359 	recv_wr.wr_nds = nds;
2360 	recv_wr.wr_sgl = sgl;
2361 
2362 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2363 	/* Use s_recvp's addr as wr id */
2364 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2365 	mutex_enter(&conn->c_lock);
2366 	if (conn->c_state & C_CONNECTED) {
2367 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2368 	}
2369 	if (((conn->c_state & C_CONNECTED) == 0) ||
2370 		ibt_status != IBT_SUCCESS) {
2371 		mutex_exit(&conn->c_lock);
2372 #ifdef DEBUG
2373 		cmn_err(CE_WARN, "rib_svc_post: QP %p failed in "
2374 		    "ibt_post_recv(), status=%d",
2375 		    (void *)qp, ibt_status);
2376 #endif
2377 		rib_rbuf_free(conn, RECV_BUFFER,
2378 			(caddr_t)(uintptr_t)sgl[0].ds_va);
2379 		(void) rib_free_svc_recv(s_recvp);
2380 		return (RDMA_FAILED);
2381 	}
2382 	mutex_exit(&conn->c_lock);
2383 
2384 	return (RDMA_SUCCESS);
2385 }
2386 
2387 /* Client */
2388 rdma_stat
2389 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2390 {
2391 
2392 	return (rib_clnt_post(conn, cl, msgid));
2393 }
2394 
2395 /* Server */
2396 rdma_stat
2397 rib_post_recv(CONN *conn, struct clist *cl)
2398 {
2399 	rib_qp_t	*qp = ctoqp(conn);
2400 
2401 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2402 		mutex_enter(&qp->posted_rbufs_lock);
2403 		qp->n_posted_rbufs++;
2404 		mutex_exit(&qp->posted_rbufs_lock);
2405 		return (RDMA_SUCCESS);
2406 	}
2407 	return (RDMA_FAILED);
2408 }
2409 
2410 /*
2411  * Client side only interface to "recv" the rpc reply buf
2412  * posted earlier by rib_post_resp(conn, cl, msgid).
2413  */
2414 rdma_stat
2415 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2416 {
2417 	struct reply *rep = NULL;
2418 	clock_t timout, cv_wait_ret;
2419 	rdma_stat ret = RDMA_SUCCESS;
2420 	rib_qp_t *qp = ctoqp(conn);
2421 
2422 	/*
2423 	 * Find the reply structure for this msgid
2424 	 */
2425 	mutex_enter(&qp->replylist_lock);
2426 
2427 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2428 	    if (rep->xid == msgid)
2429 		break;
2430 	}
2431 	if (rep != NULL) {
2432 		/*
2433 		 * If message not yet received, wait.
2434 		 */
2435 		if (rep->status == (uint_t)REPLY_WAIT) {
2436 			timout = ddi_get_lbolt() +
2437 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2438 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2439 				    &qp->replylist_lock, timout)) > 0 &&
2440 			    rep->status == (uint_t)REPLY_WAIT);
2441 
2442 			switch (cv_wait_ret) {
2443 			case -1:	/* timeout */
2444 				ret = RDMA_TIMEDOUT;
2445 				break;
2446 			case 0:
2447 				ret = RDMA_INTR;
2448 				break;
2449 			default:
2450 				break;
2451 			}
2452 		}
2453 
2454 		if (rep->status == RDMA_SUCCESS) {
2455 			struct clist *cl = NULL;
2456 
2457 			/*
2458 			 * Got message successfully
2459 			 */
2460 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2461 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2462 			*clp = cl;
2463 		} else {
2464 			if (rep->status != (uint_t)REPLY_WAIT) {
2465 				/*
2466 				 * Got error in reply message. Free
2467 				 * recv buffer here.
2468 				 */
2469 				ret = rep->status;
2470 				rib_rbuf_free(conn, RECV_BUFFER,
2471 					(caddr_t)(uintptr_t)rep->vaddr_cq);
2472 			}
2473 		}
2474 		(void) rib_remreply(qp, rep);
2475 	} else {
2476 		/*
2477 		 * No matching reply structure found for given msgid on the
2478 		 * reply wait list.
2479 		 */
2480 		ret = RDMA_INVAL;
2481 #ifdef DEBUG
2482 		cmn_err(CE_WARN, "rib_recv: no matching reply for "
2483 		    "xid %u, qp %p\n", msgid, (void *)qp);
2484 #endif
2485 	}
2486 
2487 	/*
2488 	 * Done.
2489 	 */
2490 	mutex_exit(&qp->replylist_lock);
2491 	return (ret);
2492 }
2493 
2494 /*
2495  * RDMA write a buffer to the remote address.
2496  */
2497 rdma_stat
2498 rib_write(CONN *conn, struct clist *cl, int wait)
2499 {
2500 	ibt_send_wr_t	tx_wr;
2501 	int		nds;
2502 	int		cv_sig;
2503 	ibt_wr_ds_t	sgl[DSEG_MAX];
2504 	struct send_wid	*wdesc;
2505 	ibt_status_t	ibt_status;
2506 	rdma_stat	ret = RDMA_SUCCESS;
2507 	rib_qp_t	*qp = ctoqp(conn);
2508 
2509 	if (cl == NULL) {
2510 		cmn_err(CE_WARN, "rib_write: NULL clist\n");
2511 		return (RDMA_FAILED);
2512 	}
2513 
2514 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2515 	/*
2516 	 * Remote address is at the head chunk item in list.
2517 	 */
2518 	tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_daddr;
2519 	tx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_dmemhandle.mrc_rmr; /* rkey */
2520 
2521 	nds = 0;
2522 	while (cl != NULL) {
2523 		if (nds >= DSEG_MAX) {
2524 			cmn_err(CE_WARN, "rib_write: DSEG_MAX too small!");
2525 			return (RDMA_FAILED);
2526 		}
2527 		sgl[nds].ds_va = cl->c_saddr;
2528 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2529 		sgl[nds].ds_len = cl->c_len;
2530 		cl = cl->c_next;
2531 		nds++;
2532 	}
2533 
2534 	if (wait) {
2535 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2536 		cv_sig = 1;
2537 	} else {
2538 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2539 		cv_sig = 0;
2540 	}
2541 
2542 	wdesc = rib_init_sendwait(0, cv_sig, qp);
2543 	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2544 	tx_wr.wr_opcode = IBT_WRC_RDMAW;
2545 	tx_wr.wr_trans = IBT_RC_SRV;
2546 	tx_wr.wr_nds = nds;
2547 	tx_wr.wr_sgl = sgl;
2548 
2549 	mutex_enter(&conn->c_lock);
2550 	if (conn->c_state & C_CONNECTED) {
2551 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2552 	}
2553 	if (((conn->c_state & C_CONNECTED) == 0) ||
2554 		ibt_status != IBT_SUCCESS) {
2555 		mutex_exit(&conn->c_lock);
2556 		(void) rib_free_sendwait(wdesc);
2557 		return (RDMA_FAILED);
2558 	}
2559 	mutex_exit(&conn->c_lock);
2560 
2561 	/*
2562 	 * Wait for send to complete
2563 	 */
2564 	if (wait) {
2565 		ret = rib_sendwait(qp, wdesc);
2566 		if (ret != 0) {
2567 			return (ret);
2568 		}
2569 	}
2570 	return (RDMA_SUCCESS);
2571 }
2572 
2573 /*
2574  * RDMA Read a buffer from the remote address.
2575  */
2576 rdma_stat
2577 rib_read(CONN *conn, struct clist *cl, int wait)
2578 {
2579 	ibt_send_wr_t	rx_wr;
2580 	int		nds;
2581 	int		cv_sig;
2582 	ibt_wr_ds_t	sgl[DSEG_MAX];	/* is 2 sufficient? */
2583 	struct send_wid	*wdesc;
2584 	ibt_status_t	ibt_status = IBT_SUCCESS;
2585 	rdma_stat	ret = RDMA_SUCCESS;
2586 	rib_qp_t	*qp = ctoqp(conn);
2587 
2588 	if (cl == NULL) {
2589 		cmn_err(CE_WARN, "rib_read: NULL clist\n");
2590 		return (RDMA_FAILED);
2591 	}
2592 
2593 	bzero(&rx_wr, sizeof (ibt_send_wr_t));
2594 	/*
2595 	 * Remote address is at the head chunk item in list.
2596 	 */
2597 	rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->c_saddr;
2598 	rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr; /* rkey */
2599 
2600 	nds = 0;
2601 	while (cl != NULL) {
2602 		if (nds >= DSEG_MAX) {
2603 			cmn_err(CE_WARN, "rib_read: DSEG_MAX too small!");
2604 			return (RDMA_FAILED);
2605 		}
2606 		sgl[nds].ds_va = cl->c_daddr;
2607 		sgl[nds].ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2608 		sgl[nds].ds_len = cl->c_len;
2609 		cl = cl->c_next;
2610 		nds++;
2611 	}
2612 
2613 	if (wait) {
2614 		rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2615 		cv_sig = 1;
2616 	} else {
2617 		rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2618 		cv_sig = 0;
2619 	}
2620 
2621 	wdesc = rib_init_sendwait(0, cv_sig, qp);
2622 	rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2623 	rx_wr.wr_opcode = IBT_WRC_RDMAR;
2624 	rx_wr.wr_trans = IBT_RC_SRV;
2625 	rx_wr.wr_nds = nds;
2626 	rx_wr.wr_sgl = sgl;
2627 
2628 	mutex_enter(&conn->c_lock);
2629 	if (conn->c_state & C_CONNECTED) {
2630 		ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2631 	}
2632 	if (((conn->c_state & C_CONNECTED) == 0) ||
2633 		ibt_status != IBT_SUCCESS) {
2634 		mutex_exit(&conn->c_lock);
2635 #ifdef DEBUG
2636 		if (rib_debug && ibt_status != IBT_SUCCESS)
2637 			cmn_err(CE_WARN, "rib_read: FAILED post_sending RDMAR"
2638 				" wr_id %llx on qp %p, status=%d",
2639 				(longlong_t)rx_wr.wr_id, (void *)qp,
2640 				ibt_status);
2641 #endif
2642 		(void) rib_free_sendwait(wdesc);
2643 		return (RDMA_FAILED);
2644 	}
2645 	mutex_exit(&conn->c_lock);
2646 
2647 	/*
2648 	 * Wait for send to complete
2649 	 */
2650 	if (wait) {
2651 		ret = rib_sendwait(qp, wdesc);
2652 		if (ret != 0) {
2653 			return (ret);
2654 		}
2655 	}
2656 
2657 	return (RDMA_SUCCESS);
2658 }
2659 
2660 int
2661 is_for_ipv4(ibt_ar_t *result)
2662 {
2663 	int	i, size = sizeof (struct in_addr);
2664 	uint8_t	zero = 0;
2665 
2666 	for (i = 0; i < (ATS_AR_DATA_LEN - size); i++)
2667 		zero |= result->ar_data[i];
2668 	return (zero == 0);
2669 }
2670 
2671 /*
2672  * rib_srv_cm_handler()
2673  *    Connection Manager callback to handle RC connection requests.
2674  */
2675 /* ARGSUSED */
2676 static ibt_cm_status_t
2677 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2678 	ibt_cm_return_args_t *ret_args, void *priv_data,
2679 	ibt_priv_data_len_t len)
2680 {
2681 	queue_t		*q;
2682 	rib_qp_t	*qp;
2683 	rpcib_state_t	*ribstat;
2684 	rib_hca_t	*hca;
2685 	rdma_stat	status = RDMA_SUCCESS;
2686 	int		i;
2687 	struct clist	cl;
2688 	rdma_buf_t	rdbuf;
2689 	void		*buf = NULL;
2690 	ibt_cm_req_rcv_t	cm_req_rcv;
2691 	CONN		*conn;
2692 	ibt_status_t ibt_status;
2693 	ibt_ar_t	ar_query, ar_result;
2694 	ib_gid_t	sgid;
2695 
2696 
2697 	ASSERT(any != NULL);
2698 	ASSERT(event != NULL);
2699 
2700 	ribstat = (rpcib_state_t *)any;
2701 	hca = (rib_hca_t *)ribstat->hca;
2702 	ASSERT(hca != NULL);
2703 
2704 	/* got a connection request */
2705 	switch (event->cm_type) {
2706 	case IBT_CM_EVENT_REQ_RCV:
2707 		/*
2708 		 * If the plugin is in the NO_ACCEPT state, bail out.
2709 		 */
2710 		mutex_enter(&plugin_state_lock);
2711 		if (plugin_state == NO_ACCEPT) {
2712 			mutex_exit(&plugin_state_lock);
2713 			return (IBT_CM_REJECT);
2714 		}
2715 		mutex_exit(&plugin_state_lock);
2716 
2717 		/*
2718 		 * Need to send a MRA MAD to CM so that it does not
2719 		 * timeout on us.
2720 		 */
2721 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2722 			    event->cm_event.req.req_timeout * 8, NULL, 0);
2723 
2724 		mutex_enter(&rib_stat->open_hca_lock);
2725 		q = rib_stat->q;
2726 		mutex_exit(&rib_stat->open_hca_lock);
2727 		status = rib_svc_create_chan(hca, (caddr_t)q,
2728 			event->cm_event.req.req_prim_hca_port, &qp);
2729 		if (status) {
2730 #ifdef DEBUG
2731 			cmn_err(CE_WARN, "rib_srv_cm_handler: "
2732 			    "create_channel failed %d", status);
2733 #endif
2734 			return (IBT_CM_REJECT);
2735 		}
2736 		cm_req_rcv = event->cm_event.req;
2737 
2738 #ifdef DEBUG
2739 		if (rib_debug > 2) {
2740 		    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2741 			"server recv'ed IBT_CM_EVENT_REQ_RCV\n");
2742 		    cmn_err(CE_NOTE, "\t\t SID:%llx\n",
2743 				(longlong_t)cm_req_rcv.req_service_id);
2744 		    cmn_err(CE_NOTE, "\t\t Local Port:%d\n",
2745 				cm_req_rcv.req_prim_hca_port);
2746 		    cmn_err(CE_NOTE,
2747 			"\t\t Remote GID:(prefix:%llx,guid:%llx)\n",
2748 			(longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_prefix,
2749 			(longlong_t)cm_req_rcv.req_prim_addr.av_dgid.gid_guid);
2750 		    cmn_err(CE_NOTE, "\t\t Local GID:(prefix:%llx,guid:%llx)\n",
2751 			(longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_prefix,
2752 			(longlong_t)cm_req_rcv.req_prim_addr.av_sgid.gid_guid);
2753 		    cmn_err(CE_NOTE, "\t\t Remote QPN:%u\n",
2754 			cm_req_rcv.req_remote_qpn);
2755 		    cmn_err(CE_NOTE, "\t\t Remote Q_Key:%x\n",
2756 			cm_req_rcv.req_remote_qkey);
2757 		    cmn_err(CE_NOTE, "\t\t Local QP %p (qp_hdl=%p)\n",
2758 			(void *)qp, (void *)qp->qp_hdl);
2759 		}
2760 
2761 		if (rib_debug > 2) {
2762 		    ibt_rc_chan_query_attr_t	chan_attrs;
2763 
2764 		    if (ibt_query_rc_channel(qp->qp_hdl, &chan_attrs)
2765 			== IBT_SUCCESS) {
2766 			cmn_err(CE_NOTE, "rib_svc_cm_handler: qp %p in "
2767 			    "CEP state %d\n", (void *)qp, chan_attrs.rc_state);
2768 		    }
2769 		}
2770 #endif
2771 
2772 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2773 		ret_args->cm_ret.rep.cm_rdma_ra_out = 1;
2774 		ret_args->cm_ret.rep.cm_rdma_ra_in = 1;
2775 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2776 
2777 		/*
2778 		 * Pre-posts RECV buffers
2779 		 */
2780 		conn = qptoc(qp);
2781 		for (i = 0; i < preposted_rbufs; i++) {
2782 		    bzero(&rdbuf, sizeof (rdbuf));
2783 		    rdbuf.type = RECV_BUFFER;
2784 		    buf = rib_rbuf_alloc(conn, &rdbuf);
2785 		    if (buf == NULL) {
2786 			cmn_err(CE_WARN, "rib_svc_cm_handler: "
2787 			    "No RECV_BUFFER buf!\n");
2788 			(void) rib_disconnect_channel(conn, NULL);
2789 			return (IBT_CM_REJECT);
2790 		    }
2791 
2792 		    bzero(&cl, sizeof (cl));
2793 		    cl.c_saddr = (uintptr_t)rdbuf.addr;
2794 		    cl.c_len = rdbuf.len;
2795 		    cl.c_smemhandle.mrc_lmr = rdbuf.handle.mrc_lmr; /* lkey */
2796 		    cl.c_next = NULL;
2797 		    status = rib_post_recv(conn, &cl);
2798 		    if (status != RDMA_SUCCESS) {
2799 			cmn_err(CE_WARN, "rib_srv_cm_handler: failed "
2800 			    "posting RPC_REQ buf to qp %p!", (void *)qp);
2801 			(void) rib_disconnect_channel(conn, NULL);
2802 			return (IBT_CM_REJECT);
2803 		    }
2804 		}
2805 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
2806 
2807 		/*
2808 		 * Get the address translation service record from ATS
2809 		 */
2810 		rw_enter(&hca->state_lock, RW_READER);
2811 		if (hca->state == HCA_DETACHED) {
2812 		    rw_exit(&hca->state_lock);
2813 		    return (IBT_CM_REJECT);
2814 		}
2815 		rw_exit(&hca->state_lock);
2816 
2817 		for (i = 0; i < hca->hca_nports; i++) {
2818 		    ibt_status = ibt_get_port_state(hca->hca_hdl, i+1,
2819 					&sgid, NULL);
2820 		    if (ibt_status != IBT_SUCCESS) {
2821 			if (rib_debug) {
2822 			    cmn_err(CE_WARN, "rib_srv_cm_handler: "
2823 				"ibt_get_port_state FAILED!"
2824 				"status = %d\n", ibt_status);
2825 			}
2826 		    } else {
2827 			/*
2828 			 * do ibt_query_ar()
2829 			 */
2830 			bzero(&ar_query, sizeof (ar_query));
2831 			bzero(&ar_result, sizeof (ar_result));
2832 			ar_query.ar_gid = cm_req_rcv.req_prim_addr.av_dgid;
2833 			ar_query.ar_pkey = event->cm_event.req.req_pkey;
2834 			ibt_status = ibt_query_ar(&sgid, &ar_query,
2835 							&ar_result);
2836 			if (ibt_status != IBT_SUCCESS) {
2837 			    if (rib_debug) {
2838 				cmn_err(CE_WARN, "rib_srv_cm_handler: "
2839 				    "ibt_query_ar FAILED!"
2840 				    "status = %d\n", ibt_status);
2841 			    }
2842 			} else {
2843 			    conn = qptoc(qp);
2844 
2845 			    if (is_for_ipv4(&ar_result)) {
2846 				struct sockaddr_in *s;
2847 				int sin_size = sizeof (struct sockaddr_in);
2848 				int in_size = sizeof (struct in_addr);
2849 				uint8_t	*start_pos;
2850 
2851 				conn->c_raddr.maxlen =
2852 					conn->c_raddr.len = sin_size;
2853 				conn->c_raddr.buf = kmem_zalloc(sin_size,
2854 						KM_SLEEP);
2855 				s = (struct sockaddr_in *)conn->c_raddr.buf;
2856 				s->sin_family = AF_INET;
2857 				/*
2858 				 * For IPv4,  the IP addr is stored in
2859 				 * the last four bytes of ar_data.
2860 				 */
2861 				start_pos = ar_result.ar_data +
2862 					ATS_AR_DATA_LEN - in_size;
2863 				bcopy(start_pos, &s->sin_addr, in_size);
2864 				if (rib_debug > 1) {
2865 				    char print_addr[INET_ADDRSTRLEN];
2866 
2867 				    bzero(print_addr, INET_ADDRSTRLEN);
2868 				    (void) inet_ntop(AF_INET, &s->sin_addr,
2869 						print_addr, INET_ADDRSTRLEN);
2870 				    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2871 					"remote clnt_addr: %s\n", print_addr);
2872 				}
2873 			    } else {
2874 				struct sockaddr_in6 *s6;
2875 				int sin6_size = sizeof (struct sockaddr_in6);
2876 
2877 				conn->c_raddr.maxlen =
2878 					conn->c_raddr.len = sin6_size;
2879 				conn->c_raddr.buf = kmem_zalloc(sin6_size,
2880 					KM_SLEEP);
2881 
2882 				s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
2883 				s6->sin6_family = AF_INET6;
2884 				/* sin6_addr is stored in ar_data */
2885 				bcopy(ar_result.ar_data, &s6->sin6_addr,
2886 					sizeof (struct in6_addr));
2887 				if (rib_debug > 1) {
2888 				    char print_addr[INET6_ADDRSTRLEN];
2889 
2890 				    bzero(print_addr, INET6_ADDRSTRLEN);
2891 				    (void) inet_ntop(AF_INET6, &s6->sin6_addr,
2892 						print_addr, INET6_ADDRSTRLEN);
2893 				    cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2894 					"remote clnt_addr: %s\n", print_addr);
2895 				}
2896 			    }
2897 			    return (IBT_CM_ACCEPT);
2898 			}
2899 		    }
2900 		}
2901 		if (rib_debug > 1) {
2902 		    cmn_err(CE_WARN, "rib_srv_cm_handler: "
2903 				"address record query failed!");
2904 		}
2905 		break;
2906 
2907 	case IBT_CM_EVENT_CONN_CLOSED:
2908 	{
2909 		CONN		*conn;
2910 		rib_qp_t	*qp;
2911 
2912 		switch (event->cm_event.closed) {
2913 		case IBT_CM_CLOSED_DREP_RCVD:
2914 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
2915 		case IBT_CM_CLOSED_DUP:
2916 		case IBT_CM_CLOSED_ABORT:
2917 		case IBT_CM_CLOSED_ALREADY:
2918 			/*
2919 			 * These cases indicate the local end initiated
2920 			 * the closing of the channel. Nothing to do here.
2921 			 */
2922 			break;
2923 		default:
2924 			/*
2925 			 * Reason for CONN_CLOSED event must be one of
2926 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
2927 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
2928 			 * the remote end is closing the channel. In these
2929 			 * cases free the channel and transition to error
2930 			 * state
2931 			 */
2932 			qp = ibt_get_chan_private(event->cm_channel);
2933 			conn = qptoc(qp);
2934 			mutex_enter(&conn->c_lock);
2935 			if (conn->c_state == C_DISCONN_PEND) {
2936 				mutex_exit(&conn->c_lock);
2937 				break;
2938 			}
2939 			conn->c_state = C_ERROR;
2940 
2941 			/*
2942 			 * Free the rc_channel. Channel has already
2943 			 * transitioned to ERROR state and WRs have been
2944 			 * FLUSHED_ERR already.
2945 			 */
2946 			(void) ibt_free_channel(qp->qp_hdl);
2947 			qp->qp_hdl = NULL;
2948 
2949 			/*
2950 			 * Free the conn if c_ref goes down to 0
2951 			 */
2952 			if (conn->c_ref == 0) {
2953 				/*
2954 				 * Remove from list and free conn
2955 				 */
2956 				conn->c_state = C_DISCONN_PEND;
2957 				mutex_exit(&conn->c_lock);
2958 				(void) rib_disconnect_channel(conn,
2959 					&hca->srv_conn_list);
2960 			} else {
2961 				mutex_exit(&conn->c_lock);
2962 			}
2963 #ifdef DEBUG
2964 			if (rib_debug)
2965 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2966 					" (CONN_CLOSED) channel disconnected");
2967 #endif
2968 			break;
2969 		}
2970 		break;
2971 	}
2972 	case IBT_CM_EVENT_CONN_EST:
2973 	/*
2974 	 * RTU received, hence connection established.
2975 	 */
2976 		if (rib_debug > 1)
2977 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2978 				"(CONN_EST) channel established");
2979 		break;
2980 
2981 	default:
2982 	    if (rib_debug > 2) {
2983 		/* Let CM handle the following events. */
2984 		if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
2985 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2986 			    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
2987 		} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
2988 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2989 			    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
2990 		} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
2991 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2992 			    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
2993 		} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
2994 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2995 			    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
2996 		} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
2997 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2998 			    "server recv'ed IBT_CM_EVENT_FAILURE\n");
2999 		}
3000 	    }
3001 	    return (IBT_CM_REJECT);
3002 	}
3003 
3004 	/* accept all other CM messages (i.e. let the CM handle them) */
3005 	return (IBT_CM_ACCEPT);
3006 }
3007 
3008 static rdma_stat
3009 rib_register_ats(rib_hca_t *hca)
3010 {
3011 	ibt_hca_portinfo_t	*port_infop;
3012 	uint_t			port_size;
3013 	uint_t			pki, i, num_ports, nbinds;
3014 	ibt_status_t		ibt_status;
3015 	rib_service_t		*new_service, *temp_srv;
3016 	rpcib_ats_t		*atsp;
3017 	rpcib_ibd_insts_t	ibds;
3018 	ib_pkey_t		pkey;
3019 	ibt_ar_t		ar;	/* address record */
3020 
3021 	/*
3022 	 * Query all ports for the given HCA
3023 	 */
3024 	rw_enter(&hca->state_lock, RW_READER);
3025 	if (hca->state != HCA_DETACHED) {
3026 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3027 		    &num_ports, &port_size);
3028 		rw_exit(&hca->state_lock);
3029 	} else {
3030 		rw_exit(&hca->state_lock);
3031 		return (RDMA_FAILED);
3032 	}
3033 	if (ibt_status != IBT_SUCCESS) {
3034 #ifdef DEBUG
3035 	    if (rib_debug) {
3036 		cmn_err(CE_NOTE, "rib_register_ats: FAILED in "
3037 		    "ibt_query_hca_ports, status = %d\n", ibt_status);
3038 	    }
3039 #endif
3040 		return (RDMA_FAILED);
3041 	}
3042 
3043 #ifdef	DEBUG
3044 	if (rib_debug > 1) {
3045 		cmn_err(CE_NOTE, "rib_register_ats: Ports detected "
3046 		    "%d\n", num_ports);
3047 
3048 		for (i = 0; i < num_ports; i++) {
3049 			if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3050 				cmn_err(CE_WARN, "rib_register_ats "
3051 				    "Port #: %d INACTIVE\n", i+1);
3052 			} else if (port_infop[i].p_linkstate ==
3053 			    IBT_PORT_ACTIVE) {
3054 				cmn_err(CE_NOTE, "rib_register_ats "
3055 				    "Port #: %d ACTIVE\n", i+1);
3056 			}
3057 		}
3058 	}
3059 #endif
3060 
3061 	ibds.rib_ibd_alloc = N_IBD_INSTANCES;
3062 	ibds.rib_ibd_cnt = 0;
3063 	ibds.rib_ats = (rpcib_ats_t *)kmem_zalloc(ibds.rib_ibd_alloc *
3064 			sizeof (rpcib_ats_t), KM_SLEEP);
3065 	rib_get_ibd_insts(&ibds);
3066 
3067 	if (ibds.rib_ibd_cnt == 0) {
3068 	    kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
3069 				sizeof (rpcib_ats_t));
3070 	    ibt_free_portinfo(port_infop, port_size);
3071 	    return (RDMA_FAILED);
3072 	}
3073 
3074 	/*
3075 	 * Get the IP addresses of active ports and
3076 	 * register them with ATS.  IPv4 addresses
3077 	 * have precedence over IPv6 addresses.
3078 	 */
3079 	if (get_ibd_ipaddr(&ibds) != 0) {
3080 #ifdef	DEBUG
3081 	    if (rib_debug > 1) {
3082 		cmn_err(CE_WARN, "rib_register_ats: "
3083 		    "get_ibd_ipaddr failed");
3084 	    }
3085 #endif
3086 	    kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc *
3087 				sizeof (rpcib_ats_t));
3088 	    ibt_free_portinfo(port_infop, port_size);
3089 	    return (RDMA_FAILED);
3090 	}
3091 
3092 	/*
3093 	 * Start ATS registration for active ports on this HCA.
3094 	 */
3095 	rw_enter(&hca->service_list_lock, RW_WRITER);
3096 	nbinds = 0;
3097 	new_service = NULL;
3098 	for (i = 0; i < num_ports; i++) {
3099 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3100 			continue;
3101 
3102 	    for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3103 		pkey = port_infop[i].p_pkey_tbl[pki];
3104 		if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
3105 		    ar.ar_gid = port_infop[i].p_sgid_tbl[0];
3106 		    ar.ar_pkey = pkey;
3107 		    atsp = get_ibd_entry(&ar.ar_gid, pkey, &ibds);
3108 		    if (atsp == NULL)
3109 			continue;
3110 		/*
3111 		 * store the sin[6]_addr in ar_data
3112 		 */
3113 		    (void) bzero(ar.ar_data, ATS_AR_DATA_LEN);
3114 		    if (atsp->ras_inet_type == AF_INET) {
3115 			uint8_t *start_pos;
3116 
3117 			/*
3118 			 * The ipv4 addr goes into the last
3119 			 * four bytes of ar_data.
3120 			 */
3121 			start_pos = ar.ar_data + ATS_AR_DATA_LEN -
3122 				sizeof (struct in_addr);
3123 			bcopy(&atsp->ras_sin.sin_addr, start_pos,
3124 				sizeof (struct in_addr));
3125 		    } else if (atsp->ras_inet_type == AF_INET6) {
3126 			bcopy(&atsp->ras_sin6.sin6_addr, ar.ar_data,
3127 				sizeof (struct in6_addr));
3128 		    } else
3129 			continue;
3130 
3131 		    ibt_status = ibt_register_ar(hca->ibt_clnt_hdl, &ar);
3132 		    if (ibt_status == IBT_SUCCESS) {
3133 #ifdef	DEBUG
3134 			if (rib_debug > 1) {
3135 				cmn_err(CE_WARN, "rib_register_ats: "
3136 				    "ibt_register_ar OK on port %d", i+1);
3137 			}
3138 #endif
3139 			/*
3140 			 * Allocate and prepare a service entry
3141 			 */
3142 			new_service = kmem_zalloc(sizeof (rib_service_t),
3143 				KM_SLEEP);
3144 			new_service->srv_port = i + 1;
3145 			new_service->srv_ar = ar;
3146 			new_service->srv_next = NULL;
3147 
3148 			/*
3149 			 * Add to the service list for this HCA
3150 			 */
3151 			new_service->srv_next = hca->ats_list;
3152 			hca->ats_list = new_service;
3153 			new_service = NULL;
3154 			nbinds ++;
3155 		    } else {
3156 #ifdef	DEBUG
3157 			if (rib_debug > 1) {
3158 			    cmn_err(CE_WARN, "rib_register_ats: "
3159 			    "ibt_register_ar FAILED on port %d", i+1);
3160 			}
3161 #endif
3162 		    }
3163 		}
3164 	    }
3165 	}
3166 
3167 #ifdef	DEBUG
3168 	if (rib_debug > 1) {
3169 		for (temp_srv = hca->ats_list; temp_srv != NULL;
3170 			temp_srv = temp_srv->srv_next) {
3171 				cmn_err(CE_NOTE, "Service: ATS, active on"
3172 					" port: %d\n", temp_srv->srv_port);
3173 		}
3174 	}
3175 #endif
3176 
3177 	rw_exit(&hca->service_list_lock);
3178 	kmem_free(ibds.rib_ats, ibds.rib_ibd_alloc * sizeof (rpcib_ats_t));
3179 	ibt_free_portinfo(port_infop, port_size);
3180 
3181 	if (nbinds == 0) {
3182 #ifdef	DEBUG
3183 	if (rib_debug > 1) {
3184 		cmn_err(CE_WARN, "rib_register_ats FAILED!\n");
3185 	}
3186 #endif
3187 		return (RDMA_FAILED);
3188 	}
3189 	return (RDMA_SUCCESS);
3190 }
3191 
3192 static rdma_stat
3193 rib_register_service(rib_hca_t *hca, int service_type)
3194 {
3195 	ibt_srv_desc_t		sdesc;
3196 	ibt_srv_bind_t		sbind;
3197 	ibt_hca_portinfo_t	*port_infop;
3198 	ib_svc_id_t		srv_id;
3199 	ibt_srv_hdl_t		srv_hdl;
3200 	uint_t			port_size;
3201 	uint_t			pki, i, j, num_ports, nbinds;
3202 	ibt_status_t		ibt_status;
3203 	char			**addrs;
3204 	int			addr_count;
3205 	rib_service_t		*new_service, *temp_srv;
3206 	ib_pkey_t		pkey;
3207 
3208 	/*
3209 	 * Query all ports for the given HCA
3210 	 */
3211 	rw_enter(&hca->state_lock, RW_READER);
3212 	if (hca->state != HCA_DETACHED) {
3213 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3214 		    &num_ports, &port_size);
3215 		rw_exit(&hca->state_lock);
3216 	} else {
3217 		rw_exit(&hca->state_lock);
3218 		return (RDMA_FAILED);
3219 	}
3220 	if (ibt_status != IBT_SUCCESS) {
3221 #ifdef DEBUG
3222 		cmn_err(CE_NOTE, "rib_register_service: FAILED in "
3223 		    "ibt_query_hca_ports, status = %d\n", ibt_status);
3224 #endif
3225 		return (RDMA_FAILED);
3226 	}
3227 
3228 #ifdef	DEBUG
3229 	if (rib_debug > 1) {
3230 		cmn_err(CE_NOTE, "rib_register_service: Ports detected "
3231 		    "%d\n", num_ports);
3232 
3233 		for (i = 0; i < num_ports; i++) {
3234 			if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3235 				cmn_err(CE_WARN, "rib_register_service "
3236 				    "Port #: %d INACTIVE\n", i+1);
3237 			} else if (port_infop[i].p_linkstate ==
3238 			    IBT_PORT_ACTIVE) {
3239 				cmn_err(CE_NOTE, "rib_register_service "
3240 				    "Port #: %d ACTIVE\n", i+1);
3241 			}
3242 		}
3243 	}
3244 #endif
3245 	/*
3246 	 * Get all the IP addresses on this system to register the
3247 	 * given "service type" on all DNS recognized IP addrs.
3248 	 * Each service type such as NFS will have all the systems
3249 	 * IP addresses as its different names. For now the only
3250 	 * type of service we support in RPCIB is NFS.
3251 	 */
3252 	addrs = get_ip_addrs(&addr_count);
3253 	if (addrs == NULL) {
3254 #ifdef DEBUG
3255 		if (rib_debug) {
3256 		    cmn_err(CE_WARN, "rib_register_service: "
3257 			"get_ip_addrs failed\n");
3258 		}
3259 #endif
3260 		ibt_free_portinfo(port_infop, port_size);
3261 		return (RDMA_FAILED);
3262 	}
3263 
3264 #ifdef	DEBUG
3265 	if (rib_debug > 1) {
3266 		for (i = 0; i < addr_count; i++)
3267 			cmn_err(CE_NOTE, "addr %d: %s\n", i, addrs[i]);
3268 	}
3269 #endif
3270 
3271 	rw_enter(&hca->service_list_lock, RW_WRITER);
3272 	/*
3273 	 * Start registering and binding service to active
3274 	 * on active ports on this HCA.
3275 	 */
3276 	nbinds = 0;
3277 	new_service = NULL;
3278 
3279 	/*
3280 	 * We use IP addresses as the service names for
3281 	 * service registration.  Register each of them
3282 	 * with CM to obtain a svc_id and svc_hdl.  We do not
3283 	 * register the service with machine's loopback address.
3284 	 */
3285 	for (j = 1; j < addr_count; j++) {
3286 	    (void) bzero(&srv_id, sizeof (ib_svc_id_t));
3287 	    (void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3288 	    (void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3289 
3290 	    sdesc.sd_handler = rib_srv_cm_handler;
3291 	    sdesc.sd_flags = 0;
3292 
3293 	    ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3294 			    &sdesc, 0, 1, &srv_hdl, &srv_id);
3295 	    if (ibt_status != IBT_SUCCESS) {
3296 #ifdef DEBUG
3297 		if (rib_debug) {
3298 		    cmn_err(CE_WARN, "rib_register_service: "
3299 			"ibt_register_service FAILED, status "
3300 			"= %d\n", ibt_status);
3301 		}
3302 #endif
3303 		/*
3304 		 * No need to go on, since we failed to obtain
3305 		 * a srv_id and srv_hdl. Move on to the next
3306 		 * IP addr as a service name.
3307 		 */
3308 		continue;
3309 	    }
3310 	    for (i = 0; i < num_ports; i++) {
3311 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3312 			continue;
3313 
3314 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3315 		    pkey = port_infop[i].p_pkey_tbl[pki];
3316 		    if ((pkey & IBSRM_HB) && (pkey != IB_PKEY_INVALID_FULL)) {
3317 
3318 			/*
3319 			 * Allocate and prepare a service entry
3320 			 */
3321 			new_service = kmem_zalloc(1 * sizeof (rib_service_t),
3322 			    KM_SLEEP);
3323 			new_service->srv_type = service_type;
3324 			new_service->srv_port = i + 1;
3325 			new_service->srv_id = srv_id;
3326 			new_service->srv_hdl = srv_hdl;
3327 			new_service->srv_sbind_hdl = kmem_zalloc(1 *
3328 			    sizeof (ibt_sbind_hdl_t), KM_SLEEP);
3329 
3330 			new_service->srv_name = kmem_zalloc(IB_SVC_NAME_LEN,
3331 			    KM_SLEEP);
3332 			(void) bcopy(addrs[j], new_service->srv_name,
3333 			    IB_SVC_NAME_LEN);
3334 			(void) strlcat(new_service->srv_name, "::NFS",
3335 				IB_SVC_NAME_LEN);
3336 			new_service->srv_next = NULL;
3337 
3338 			/*
3339 			 * Bind the service, specified by the IP address,
3340 			 * to the port/pkey using the srv_hdl returned
3341 			 * from ibt_register_service().
3342 			 */
3343 			(void) bzero(&sbind, sizeof (ibt_srv_bind_t));
3344 			sbind.sb_pkey = pkey;
3345 			sbind.sb_lease = 0xFFFFFFFF;
3346 			sbind.sb_key[0] = NFS_SEC_KEY0;
3347 			sbind.sb_key[1] = NFS_SEC_KEY1;
3348 			sbind.sb_name = new_service->srv_name;
3349 
3350 #ifdef	DEBUG
3351 			if (rib_debug > 1) {
3352 				cmn_err(CE_NOTE, "rib_register_service: "
3353 				    "binding service using name: %s\n",
3354 				    sbind.sb_name);
3355 			}
3356 #endif
3357 			ibt_status = ibt_bind_service(srv_hdl,
3358 			    port_infop[i].p_sgid_tbl[0], &sbind, rib_stat,
3359 			    new_service->srv_sbind_hdl);
3360 			if (ibt_status != IBT_SUCCESS) {
3361 #ifdef	DEBUG
3362 			    if (rib_debug) {
3363 				cmn_err(CE_WARN, "rib_register_service: FAILED"
3364 				    " in ibt_bind_service, status = %d\n",
3365 				    ibt_status);
3366 			    }
3367 #endif
3368 				kmem_free(new_service->srv_sbind_hdl,
3369 				    sizeof (ibt_sbind_hdl_t));
3370 				kmem_free(new_service->srv_name,
3371 				    IB_SVC_NAME_LEN);
3372 				kmem_free(new_service,
3373 				    sizeof (rib_service_t));
3374 				new_service = NULL;
3375 				continue;
3376 			}
3377 #ifdef	DEBUG
3378 			if (rib_debug > 1) {
3379 				if (ibt_status == IBT_SUCCESS)
3380 					cmn_err(CE_NOTE, "rib_regstr_service: "
3381 					    "Serv: %s REGISTERED on port: %d",
3382 					    sbind.sb_name, i+1);
3383 			}
3384 #endif
3385 			/*
3386 			 * Add to the service list for this HCA
3387 			 */
3388 			new_service->srv_next = hca->service_list;
3389 			hca->service_list = new_service;
3390 			new_service = NULL;
3391 			nbinds ++;
3392 		    }
3393 		}
3394 	    }
3395 	}
3396 	rw_exit(&hca->service_list_lock);
3397 
3398 #ifdef	DEBUG
3399 	if (rib_debug > 1) {
3400 		/*
3401 		 * Change this print to a more generic one, as rpcib
3402 		 * is supposed to handle multiple service types.
3403 		 */
3404 		for (temp_srv = hca->service_list; temp_srv != NULL;
3405 			temp_srv = temp_srv->srv_next) {
3406 				cmn_err(CE_NOTE, "NFS-IB, active on port:"
3407 					" %d\n"
3408 					"Using name: %s", temp_srv->srv_port,
3409 					temp_srv->srv_name);
3410 		}
3411 	}
3412 #endif
3413 
3414 	ibt_free_portinfo(port_infop, port_size);
3415 	for (i = 0; i < addr_count; i++) {
3416 		if (addrs[i])
3417 			kmem_free(addrs[i], IB_SVC_NAME_LEN);
3418 	}
3419 	kmem_free(addrs, addr_count * sizeof (char *));
3420 
3421 	if (nbinds == 0) {
3422 #ifdef	DEBUG
3423 	    if (rib_debug) {
3424 		cmn_err(CE_WARN, "rib_register_service: "
3425 		    "bind_service FAILED!\n");
3426 	    }
3427 #endif
3428 		return (RDMA_FAILED);
3429 	} else {
3430 		/*
3431 		 * Put this plugin into accept state, since atleast
3432 		 * one registration was successful.
3433 		 */
3434 		mutex_enter(&plugin_state_lock);
3435 		plugin_state = ACCEPT;
3436 		mutex_exit(&plugin_state_lock);
3437 		return (RDMA_SUCCESS);
3438 	}
3439 }
3440 
3441 void
3442 rib_listen(struct rdma_svc_data *rd)
3443 {
3444 	rdma_stat status = RDMA_SUCCESS;
3445 
3446 	rd->active = 0;
3447 	rd->err_code = RDMA_FAILED;
3448 
3449 	/*
3450 	 * First check if a hca is still attached
3451 	 */
3452 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3453 	if (rib_stat->hca->state != HCA_INITED) {
3454 		rw_exit(&rib_stat->hca->state_lock);
3455 		return;
3456 	}
3457 	rw_exit(&rib_stat->hca->state_lock);
3458 
3459 	rib_stat->q = &rd->q;
3460 	/*
3461 	 * Register the Address translation service
3462 	 */
3463 	mutex_enter(&rib_stat->open_hca_lock);
3464 	if (ats_running == 0) {
3465 		if (rib_register_ats(rib_stat->hca) != RDMA_SUCCESS) {
3466 #ifdef	DEBUG
3467 		    if (rib_debug) {
3468 			cmn_err(CE_WARN,
3469 			    "rib_listen(): ats registration failed!");
3470 		    }
3471 #endif
3472 		    mutex_exit(&rib_stat->open_hca_lock);
3473 		    return;
3474 		} else {
3475 			ats_running = 1;
3476 		}
3477 	}
3478 	mutex_exit(&rib_stat->open_hca_lock);
3479 
3480 	/*
3481 	 * Right now the only service type is NFS. Hence force feed this
3482 	 * value. Ideally to communicate the service type it should be
3483 	 * passed down in rdma_svc_data.
3484 	 */
3485 	rib_stat->service_type = NFS;
3486 	status = rib_register_service(rib_stat->hca, NFS);
3487 	if (status != RDMA_SUCCESS) {
3488 		rd->err_code = status;
3489 		return;
3490 	}
3491 	/*
3492 	 * Service active on an HCA, check rd->err_code for more
3493 	 * explainable errors.
3494 	 */
3495 	rd->active = 1;
3496 	rd->err_code = status;
3497 }
3498 
3499 /* XXXX */
3500 /* ARGSUSED */
3501 static void
3502 rib_listen_stop(struct rdma_svc_data *svcdata)
3503 {
3504 	rib_hca_t		*hca;
3505 
3506 	/*
3507 	 * KRPC called the RDMATF to stop the listeners, this means
3508 	 * stop sending incomming or recieved requests to KRPC master
3509 	 * transport handle for RDMA-IB. This is also means that the
3510 	 * master transport handle, responsible for us, is going away.
3511 	 */
3512 	mutex_enter(&plugin_state_lock);
3513 	plugin_state = NO_ACCEPT;
3514 	if (svcdata != NULL)
3515 		svcdata->active = 0;
3516 	mutex_exit(&plugin_state_lock);
3517 
3518 	/*
3519 	 * First check if a hca is still attached
3520 	 */
3521 	hca = rib_stat->hca;
3522 	rw_enter(&hca->state_lock, RW_READER);
3523 	if (hca->state != HCA_INITED) {
3524 		rw_exit(&hca->state_lock);
3525 		return;
3526 	}
3527 	rib_stop_services(hca);
3528 	rw_exit(&hca->state_lock);
3529 }
3530 
3531 /*
3532  * Traverse the HCA's service list to unbind and deregister services.
3533  * Instead of unbinding the service for a service handle by
3534  * calling ibt_unbind_service() for each port/pkey, we unbind
3535  * all the services for the service handle by making only one
3536  * call to ibt_unbind_all_services().  Then, we deregister the
3537  * service for the service handle.
3538  *
3539  * When traversing the entries in service_list, we compare the
3540  * srv_hdl of the current entry with that of the next.  If they
3541  * are different or if the next entry is NULL, the current entry
3542  * marks the last binding of the service handle.  In this case,
3543  * call ibt_unbind_all_services() and deregister the service for
3544  * the service handle.  If they are the same, the current and the
3545  * next entries are bound to the same service handle.  In this
3546  * case, move on to the next entry.
3547  */
3548 static void
3549 rib_stop_services(rib_hca_t *hca)
3550 {
3551 	rib_service_t		*srv_list, *to_remove;
3552 	ibt_status_t   		ibt_status;
3553 
3554 	/*
3555 	 * unbind and deregister the services for this service type.
3556 	 * Right now there is only one service type. In future it will
3557 	 * be passed down to this function.
3558 	 */
3559 	rw_enter(&hca->service_list_lock, RW_WRITER);
3560 	srv_list = hca->service_list;
3561 	while (srv_list != NULL) {
3562 		to_remove = srv_list;
3563 		srv_list = to_remove->srv_next;
3564 		if (srv_list == NULL || bcmp(to_remove->srv_hdl,
3565 		    srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
3566 
3567 		    ibt_status = ibt_unbind_all_services(to_remove->srv_hdl);
3568 		    if (ibt_status != IBT_SUCCESS) {
3569 			cmn_err(CE_WARN, "rib_listen_stop: "
3570 			    "ibt_unbind_all_services FAILED"
3571 				" status: %d\n", ibt_status);
3572 		    }
3573 
3574 		    ibt_status =
3575 			ibt_deregister_service(hca->ibt_clnt_hdl,
3576 				to_remove->srv_hdl);
3577 		    if (ibt_status != IBT_SUCCESS) {
3578 			cmn_err(CE_WARN, "rib_listen_stop: "
3579 			    "ibt_deregister_service FAILED"
3580 				" status: %d\n", ibt_status);
3581 		    }
3582 
3583 #ifdef	DEBUG
3584 		    if (rib_debug > 1) {
3585 			if (ibt_status == IBT_SUCCESS)
3586 				cmn_err(CE_NOTE, "rib_listen_stop: "
3587 				    "Successfully stopped and"
3588 				    " UNREGISTERED service: %s\n",
3589 				    to_remove->srv_name);
3590 		    }
3591 #endif
3592 		}
3593 		kmem_free(to_remove->srv_name, IB_SVC_NAME_LEN);
3594 		kmem_free(to_remove->srv_sbind_hdl,
3595 			sizeof (ibt_sbind_hdl_t));
3596 
3597 		kmem_free(to_remove, sizeof (rib_service_t));
3598 	}
3599 	hca->service_list = NULL;
3600 	rw_exit(&hca->service_list_lock);
3601 }
3602 
3603 static struct svc_recv *
3604 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3605 {
3606 	struct svc_recv	*recvp;
3607 
3608 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3609 	recvp->vaddr = sgl->ds_va;
3610 	recvp->qp = qp;
3611 	recvp->bytes_xfer = 0;
3612 	return (recvp);
3613 }
3614 
3615 static int
3616 rib_free_svc_recv(struct svc_recv *recvp)
3617 {
3618 	kmem_free(recvp, sizeof (*recvp));
3619 
3620 	return (0);
3621 }
3622 
3623 static struct reply *
3624 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3625 {
3626 	struct reply	*rep;
3627 
3628 
3629 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3630 	if (rep == NULL) {
3631 		mutex_exit(&qp->replylist_lock);
3632 		cmn_err(CE_WARN, "rib_addreplylist: no memory\n");
3633 		return (NULL);
3634 	}
3635 	rep->xid = msgid;
3636 	rep->vaddr_cq = NULL;
3637 	rep->bytes_xfer = 0;
3638 	rep->status = (uint_t)REPLY_WAIT;
3639 	rep->prev = NULL;
3640 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3641 
3642 	mutex_enter(&qp->replylist_lock);
3643 	if (qp->replylist) {
3644 		rep->next = qp->replylist;
3645 		qp->replylist->prev = rep;
3646 	}
3647 	qp->rep_list_size++;
3648 	if (rib_debug > 1)
3649 	    cmn_err(CE_NOTE, "rib_addreplylist: qp:%p, rep_list_size:%d\n",
3650 		(void *)qp, qp->rep_list_size);
3651 	qp->replylist = rep;
3652 	mutex_exit(&qp->replylist_lock);
3653 
3654 	return (rep);
3655 }
3656 
3657 static rdma_stat
3658 rib_rem_replylist(rib_qp_t *qp)
3659 {
3660 	struct reply	*r, *n;
3661 
3662 	mutex_enter(&qp->replylist_lock);
3663 	for (r = qp->replylist; r != NULL; r = n) {
3664 		n = r->next;
3665 		(void) rib_remreply(qp, r);
3666 	}
3667 	mutex_exit(&qp->replylist_lock);
3668 
3669 	return (RDMA_SUCCESS);
3670 }
3671 
3672 static int
3673 rib_remreply(rib_qp_t *qp, struct reply *rep)
3674 {
3675 
3676 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3677 	if (rep->prev) {
3678 		rep->prev->next = rep->next;
3679 	}
3680 	if (rep->next) {
3681 		rep->next->prev = rep->prev;
3682 	}
3683 	if (qp->replylist == rep)
3684 		qp->replylist = rep->next;
3685 
3686 	cv_destroy(&rep->wait_cv);
3687 	qp->rep_list_size--;
3688 	if (rib_debug > 1)
3689 	    cmn_err(CE_NOTE, "rib_remreply: qp:%p, rep_list_size:%d\n",
3690 		(void *)qp, qp->rep_list_size);
3691 
3692 	kmem_free(rep, sizeof (*rep));
3693 
3694 	return (0);
3695 }
3696 
3697 rdma_stat
3698 rib_registermem(CONN *conn, caddr_t buf, uint_t buflen,
3699 	struct mrc *buf_handle)
3700 {
3701 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3702 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3703 	rdma_stat	status;
3704 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3705 
3706 	/*
3707 	 * Note: ALL buffer pools use the same memory type RDMARW.
3708 	 */
3709 	status = rib_reg_mem(hca, buf, buflen, 0, &mr_hdl, &mr_desc);
3710 	if (status == RDMA_SUCCESS) {
3711 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3712 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3713 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3714 	} else {
3715 		buf_handle->mrc_linfo = NULL;
3716 		buf_handle->mrc_lmr = 0;
3717 		buf_handle->mrc_rmr = 0;
3718 	}
3719 	return (status);
3720 }
3721 
3722 static rdma_stat
3723 rib_reg_mem(rib_hca_t *hca, caddr_t buf, uint_t size, ibt_mr_flags_t spec,
3724 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3725 {
3726 	ibt_mr_attr_t	mem_attr;
3727 	ibt_status_t	ibt_status;
3728 
3729 	mem_attr.mr_vaddr = (uintptr_t)buf;
3730 	mem_attr.mr_len = (ib_msglen_t)size;
3731 	mem_attr.mr_as = NULL;
3732 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3733 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3734 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3735 
3736 	rw_enter(&hca->state_lock, RW_READER);
3737 	if (hca->state == HCA_INITED) {
3738 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3739 					&mem_attr, mr_hdlp, mr_descp);
3740 		rw_exit(&hca->state_lock);
3741 	} else {
3742 		rw_exit(&hca->state_lock);
3743 		return (RDMA_FAILED);
3744 	}
3745 
3746 	if (ibt_status != IBT_SUCCESS) {
3747 		cmn_err(CE_WARN, "rib_reg_mem: ibt_register_mr "
3748 			"(spec:%d) failed for addr %llX, status %d",
3749 			spec, (longlong_t)mem_attr.mr_vaddr, ibt_status);
3750 		return (RDMA_FAILED);
3751 	}
3752 	return (RDMA_SUCCESS);
3753 }
3754 
3755 rdma_stat
3756 rib_registermemsync(CONN *conn, caddr_t buf, uint_t buflen,
3757 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle)
3758 {
3759 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3760 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3761 	rdma_stat	status;
3762 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3763 
3764 	/*
3765 	 * Non-coherent memory registration.
3766 	 */
3767 	status = rib_reg_mem(hca, buf, buflen, IBT_MR_NONCOHERENT, &mr_hdl,
3768 			&mr_desc);
3769 	if (status == RDMA_SUCCESS) {
3770 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3771 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3772 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3773 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3774 	} else {
3775 		buf_handle->mrc_linfo = NULL;
3776 		buf_handle->mrc_lmr = 0;
3777 		buf_handle->mrc_rmr = 0;
3778 	}
3779 	return (status);
3780 }
3781 
3782 /* ARGSUSED */
3783 rdma_stat
3784 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3785 {
3786 	rib_hca_t *hca = (ctoqp(conn))->hca;
3787 
3788 	/*
3789 	 * Allow memory deregistration even if HCA is
3790 	 * getting detached. Need all outstanding
3791 	 * memory registrations to be deregistered
3792 	 * before HCA_DETACH_EVENT can be accepted.
3793 	 */
3794 	(void) ibt_deregister_mr(hca->hca_hdl,
3795 			(ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3796 	return (RDMA_SUCCESS);
3797 }
3798 
3799 /* ARGSUSED */
3800 rdma_stat
3801 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3802 		RIB_SYNCMEM_HANDLE sync_handle)
3803 {
3804 	(void) rib_deregistermem(conn, buf, buf_handle);
3805 
3806 	return (RDMA_SUCCESS);
3807 }
3808 
3809 /* ARGSUSED */
3810 rdma_stat
3811 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3812 		int len, int cpu)
3813 {
3814 	ibt_status_t	status;
3815 	rib_hca_t *hca = (ctoqp(conn))->hca;
3816 	ibt_mr_sync_t	mr_segment;
3817 
3818 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3819 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3820 	mr_segment.ms_len = (ib_memlen_t)len;
3821 	if (cpu) {
3822 		/* make incoming data visible to memory */
3823 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3824 	} else {
3825 		/* make memory changes visible to IO */
3826 		mr_segment.ms_flags = IBT_SYNC_READ;
3827 	}
3828 	rw_enter(&hca->state_lock, RW_READER);
3829 	if (hca->state == HCA_INITED) {
3830 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3831 		rw_exit(&hca->state_lock);
3832 	} else {
3833 		rw_exit(&hca->state_lock);
3834 		return (RDMA_FAILED);
3835 	}
3836 
3837 	if (status == IBT_SUCCESS)
3838 		return (RDMA_SUCCESS);
3839 	else {
3840 #ifdef DEBUG
3841 		cmn_err(CE_WARN, "rib_syncmem: ibt_sync_mr failed with %d\n",
3842 			status);
3843 #endif
3844 		return (RDMA_FAILED);
3845 	}
3846 }
3847 
3848 /*
3849  * XXXX	????
3850  */
3851 static rdma_stat
3852 rib_getinfo(rdma_info_t *info)
3853 {
3854 	/*
3855 	 * XXXX	Hack!
3856 	 */
3857 	info->addrlen = 16;
3858 	info->mts = 1000000;
3859 	info->mtu = 1000000;
3860 
3861 	return (RDMA_SUCCESS);
3862 }
3863 
3864 rib_bufpool_t *
3865 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3866 {
3867 	rib_bufpool_t	*rbp = NULL;
3868 	bufpool_t	*bp = NULL;
3869 	caddr_t		buf;
3870 	ibt_mr_attr_t	mem_attr;
3871 	ibt_status_t	ibt_status;
3872 	int		i, j;
3873 
3874 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3875 
3876 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3877 			num * sizeof (void *), KM_SLEEP);
3878 
3879 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3880 	bp->numelems = num;
3881 
3882 	switch (ptype) {
3883 	    case SEND_BUFFER:
3884 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3885 		/* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
3886 		bp->rsize = RPC_MSG_SZ;
3887 		break;
3888 	    case RECV_BUFFER:
3889 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3890 		/* mem_attr.mr_flags |= IBT_MR_ENABLE_WINDOW_BIND; */
3891 		bp->rsize = RPC_BUF_SIZE;
3892 		break;
3893 	    default:
3894 		goto fail;
3895 	}
3896 
3897 	/*
3898 	 * Register the pool.
3899 	 */
3900 	bp->bufsize = num * bp->rsize;
3901 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3902 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3903 			sizeof (ibt_mr_hdl_t), KM_SLEEP);
3904 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3905 			sizeof (ibt_mr_desc_t), KM_SLEEP);
3906 
3907 	rw_enter(&hca->state_lock, RW_READER);
3908 	if (hca->state != HCA_INITED) {
3909 		rw_exit(&hca->state_lock);
3910 		goto fail;
3911 	}
3912 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3913 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3914 		mem_attr.mr_vaddr = (uintptr_t)buf;
3915 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3916 		mem_attr.mr_as = NULL;
3917 		ibt_status = ibt_register_mr(hca->hca_hdl,
3918 			hca->pd_hdl, &mem_attr, &rbp->mr_hdl[i],
3919 			&rbp->mr_desc[i]);
3920 		if (ibt_status != IBT_SUCCESS) {
3921 		    for (j = 0; j < i; j++) {
3922 			(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[j]);
3923 		    }
3924 		    rw_exit(&hca->state_lock);
3925 		    goto fail;
3926 		}
3927 	}
3928 	rw_exit(&hca->state_lock);
3929 
3930 	buf = (caddr_t)bp->buf;
3931 	for (i = 0; i < num; i++, buf += bp->rsize) {
3932 		bp->buflist[i] = (void *)buf;
3933 	}
3934 	bp->buffree = num - 1;	/* no. of free buffers */
3935 	rbp->bpool = bp;
3936 
3937 	return (rbp);
3938 fail:
3939 	if (bp) {
3940 	    if (bp->buf)
3941 		kmem_free(bp->buf, bp->bufsize);
3942 	    kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3943 	}
3944 	if (rbp) {
3945 	    if (rbp->mr_hdl)
3946 		kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3947 	    if (rbp->mr_desc)
3948 		kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3949 	    kmem_free(rbp, sizeof (rib_bufpool_t));
3950 	}
3951 	return (NULL);
3952 }
3953 
3954 static void
3955 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3956 {
3957 	int i;
3958 	rib_bufpool_t *rbp = NULL;
3959 	bufpool_t *bp;
3960 
3961 	/*
3962 	 * Obtain pool address based on type of pool
3963 	 */
3964 	switch (ptype) {
3965 		case SEND_BUFFER:
3966 			rbp = hca->send_pool;
3967 			break;
3968 		case RECV_BUFFER:
3969 			rbp = hca->recv_pool;
3970 			break;
3971 		default:
3972 			return;
3973 	}
3974 	if (rbp == NULL)
3975 		return;
3976 
3977 	bp = rbp->bpool;
3978 
3979 	/*
3980 	 * Deregister the pool memory and free it.
3981 	 */
3982 	for (i = 0; i < bp->numelems; i++) {
3983 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3984 	}
3985 }
3986 
3987 static void
3988 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3989 {
3990 
3991 	rib_bufpool_t *rbp = NULL;
3992 	bufpool_t *bp;
3993 
3994 	/*
3995 	 * Obtain pool address based on type of pool
3996 	 */
3997 	switch (ptype) {
3998 		case SEND_BUFFER:
3999 			rbp = hca->send_pool;
4000 			break;
4001 		case RECV_BUFFER:
4002 			rbp = hca->recv_pool;
4003 			break;
4004 		default:
4005 			return;
4006 	}
4007 	if (rbp == NULL)
4008 		return;
4009 
4010 	bp = rbp->bpool;
4011 
4012 	/*
4013 	 * Free the pool memory.
4014 	 */
4015 	if (rbp->mr_hdl)
4016 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
4017 
4018 	if (rbp->mr_desc)
4019 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4020 
4021 	if (bp->buf)
4022 		kmem_free(bp->buf, bp->bufsize);
4023 	mutex_destroy(&bp->buflock);
4024 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4025 	kmem_free(rbp, sizeof (rib_bufpool_t));
4026 }
4027 
4028 void
4029 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4030 {
4031 	/*
4032 	 * Deregister the pool memory and free it.
4033 	 */
4034 	rib_rbufpool_deregister(hca, ptype);
4035 	rib_rbufpool_free(hca, ptype);
4036 }
4037 
4038 /*
4039  * Fetch a buffer from the pool of type specified in rdbuf->type.
4040  */
4041 static rdma_stat
4042 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4043 {
4044 
4045 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4046 	if (rdbuf->addr) {
4047 		switch (rdbuf->type) {
4048 		case SEND_BUFFER:
4049 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
4050 			break;
4051 		case RECV_BUFFER:
4052 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
4053 			break;
4054 		default:
4055 			rdbuf->len = 0;
4056 		}
4057 		return (RDMA_SUCCESS);
4058 	} else
4059 		return (RDMA_FAILED);
4060 }
4061 
4062 
4063 /*
4064  * Fetch a buffer of specified type.
4065  * Note that rdbuf->handle is mw's rkey.
4066  */
4067 static void *
4068 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4069 {
4070 	rib_qp_t	*qp = ctoqp(conn);
4071 	rib_hca_t	*hca = qp->hca;
4072 	rdma_btype	ptype = rdbuf->type;
4073 	void		*buf;
4074 	rib_bufpool_t	*rbp = NULL;
4075 	bufpool_t	*bp;
4076 	int		i;
4077 
4078 	/*
4079 	 * Obtain pool address based on type of pool
4080 	 */
4081 	switch (ptype) {
4082 		case SEND_BUFFER:
4083 			rbp = hca->send_pool;
4084 			break;
4085 		case RECV_BUFFER:
4086 			rbp = hca->recv_pool;
4087 			break;
4088 		default:
4089 			return (NULL);
4090 	}
4091 	if (rbp == NULL)
4092 		return (NULL);
4093 
4094 	bp = rbp->bpool;
4095 
4096 	mutex_enter(&bp->buflock);
4097 	if (bp->buffree < 0) {
4098 		cmn_err(CE_WARN, "rib_rbuf_alloc: No free buffers!");
4099 		mutex_exit(&bp->buflock);
4100 		return (NULL);
4101 	}
4102 
4103 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4104 	buf = bp->buflist[bp->buffree];
4105 	rdbuf->addr = buf;
4106 	rdbuf->len = bp->rsize;
4107 	for (i = bp->numelems - 1; i >= 0; i--) {
4108 	    if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4109 		rdbuf->handle.mrc_rmr = (uint32_t)rbp->mr_desc[i].md_rkey;
4110 		rdbuf->handle.mrc_linfo = (uintptr_t)rbp->mr_hdl[i];
4111 		rdbuf->handle.mrc_lmr = (uint32_t)rbp->mr_desc[i].md_lkey;
4112 		bp->buffree--;
4113 		if (rib_debug > 1)
4114 		    cmn_err(CE_NOTE, "rib_rbuf_alloc: %d free bufs "
4115 			"(type %d)\n", bp->buffree+1, ptype);
4116 
4117 		mutex_exit(&bp->buflock);
4118 
4119 		return (buf);
4120 	    }
4121 	}
4122 	cmn_err(CE_WARN, "rib_rbuf_alloc: NO matching buf %p of "
4123 		"type %d found!", buf, ptype);
4124 	mutex_exit(&bp->buflock);
4125 
4126 	return (NULL);
4127 }
4128 
4129 static void
4130 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4131 {
4132 
4133 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4134 }
4135 
4136 static void
4137 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4138 {
4139 	rib_qp_t *qp = ctoqp(conn);
4140 	rib_hca_t *hca = qp->hca;
4141 	rib_bufpool_t *rbp = NULL;
4142 	bufpool_t *bp;
4143 
4144 	/*
4145 	 * Obtain pool address based on type of pool
4146 	 */
4147 	switch (ptype) {
4148 		case SEND_BUFFER:
4149 			rbp = hca->send_pool;
4150 			break;
4151 		case RECV_BUFFER:
4152 			rbp = hca->recv_pool;
4153 			break;
4154 		default:
4155 			return;
4156 	}
4157 	if (rbp == NULL)
4158 		return;
4159 
4160 	bp = rbp->bpool;
4161 
4162 	mutex_enter(&bp->buflock);
4163 	if (++bp->buffree >= bp->numelems) {
4164 		/*
4165 		 * Should never happen
4166 		 */
4167 		cmn_err(CE_WARN, "rib_rbuf_free: One (type %d) "
4168 			"too many frees!", ptype);
4169 		bp->buffree--;
4170 	} else {
4171 		bp->buflist[bp->buffree] = buf;
4172 		if (rib_debug > 1)
4173 		    cmn_err(CE_NOTE, "rib_rbuf_free: %d free bufs "
4174 			"(type %d)\n", bp->buffree+1, ptype);
4175 	}
4176 	mutex_exit(&bp->buflock);
4177 }
4178 
4179 static rdma_stat
4180 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4181 {
4182 	rw_enter(&connlist->conn_lock, RW_WRITER);
4183 	if (connlist->conn_hd) {
4184 		cn->c_next = connlist->conn_hd;
4185 		connlist->conn_hd->c_prev = cn;
4186 	}
4187 	connlist->conn_hd = cn;
4188 	rw_exit(&connlist->conn_lock);
4189 
4190 	return (RDMA_SUCCESS);
4191 }
4192 
4193 static rdma_stat
4194 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4195 {
4196 	rw_enter(&connlist->conn_lock, RW_WRITER);
4197 	if (cn->c_prev) {
4198 		cn->c_prev->c_next = cn->c_next;
4199 	}
4200 	if (cn->c_next) {
4201 		cn->c_next->c_prev = cn->c_prev;
4202 	}
4203 	if (connlist->conn_hd == cn)
4204 		connlist->conn_hd = cn->c_next;
4205 	rw_exit(&connlist->conn_lock);
4206 
4207 	return (RDMA_SUCCESS);
4208 }
4209 
4210 /*
4211  * Connection management.
4212  * IBTF does not support recycling of channels. So connections are only
4213  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR or
4214  * C_DISCONN_PEND state. No C_IDLE state.
4215  * C_CONN_PEND state: Connection establishment in progress to the server.
4216  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4217  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4218  * only in this state.
4219  * C_ERROR state: A connection transitions to this state when WRs on the
4220  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4221  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4222  * C_DISCONN_PEND state: When a connection is in C_ERROR state and when
4223  * c_ref drops to 0 (this indicates that RPC has no more references to this
4224  * connection), the connection should be destroyed. A connection transitions
4225  * into this state when it is being destroyed.
4226  */
4227 static rdma_stat
4228 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
4229 {
4230 	CONN *cn;
4231 	int status = RDMA_SUCCESS;
4232 	rib_hca_t *hca = (rib_hca_t *)handle;
4233 	rib_qp_t *qp;
4234 	clock_t cv_stat, timout;
4235 	ibt_path_info_t path;
4236 
4237 again:
4238 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4239 	cn = hca->cl_conn_list.conn_hd;
4240 	while (cn != NULL) {
4241 		/*
4242 		 * First, clear up any connection in the ERROR state
4243 		 */
4244 		mutex_enter(&cn->c_lock);
4245 		if (cn->c_state == C_ERROR) {
4246 			if (cn->c_ref == 0) {
4247 				/*
4248 				 * Remove connection from list and destroy it.
4249 				 */
4250 				cn->c_state = C_DISCONN_PEND;
4251 				mutex_exit(&cn->c_lock);
4252 				rw_exit(&hca->cl_conn_list.conn_lock);
4253 				(void) rib_disconnect_channel(cn,
4254 				    &hca->cl_conn_list);
4255 				goto again;
4256 			}
4257 			mutex_exit(&cn->c_lock);
4258 			cn = cn->c_next;
4259 			continue;
4260 		} else if (cn->c_state == C_DISCONN_PEND) {
4261 			mutex_exit(&cn->c_lock);
4262 			cn = cn->c_next;
4263 			continue;
4264 		}
4265 		if ((cn->c_raddr.len == svcaddr->len) &&
4266 		    bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
4267 			/*
4268 			 * Our connection. Give up conn list lock
4269 			 * as we are done traversing the list.
4270 			 */
4271 			rw_exit(&hca->cl_conn_list.conn_lock);
4272 			if (cn->c_state == C_CONNECTED) {
4273 				cn->c_ref++;	/* sharing a conn */
4274 				mutex_exit(&cn->c_lock);
4275 				*conn = cn;
4276 				return (status);
4277 			}
4278 			if (cn->c_state == C_CONN_PEND) {
4279 				/*
4280 				 * Hold a reference to this conn before
4281 				 * we give up the lock.
4282 				 */
4283 				cn->c_ref++;
4284 				timout =  ddi_get_lbolt() +
4285 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
4286 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4287 					&cn->c_lock, timout)) > 0 &&
4288 					cn->c_state == C_CONN_PEND)
4289 					;
4290 				if (cv_stat == 0) {
4291 					cn->c_ref--;
4292 					mutex_exit(&cn->c_lock);
4293 					return (RDMA_INTR);
4294 				}
4295 				if (cv_stat < 0) {
4296 					cn->c_ref--;
4297 					mutex_exit(&cn->c_lock);
4298 					return (RDMA_TIMEDOUT);
4299 				}
4300 				if (cn->c_state == C_CONNECTED) {
4301 					*conn = cn;
4302 					mutex_exit(&cn->c_lock);
4303 					return (status);
4304 				} else {
4305 					cn->c_ref--;
4306 					mutex_exit(&cn->c_lock);
4307 					return (RDMA_TIMEDOUT);
4308 				}
4309 			}
4310 		}
4311 		mutex_exit(&cn->c_lock);
4312 		cn = cn->c_next;
4313 	}
4314 	rw_exit(&hca->cl_conn_list.conn_lock);
4315 
4316 	status = rib_chk_srv_ats(hca, svcaddr, addr_type, &path);
4317 	if (status != RDMA_SUCCESS) {
4318 #ifdef DEBUG
4319 		if (rib_debug) {
4320 			cmn_err(CE_WARN, "rib_conn_get: "
4321 				"No server ATS record!");
4322 		}
4323 #endif
4324 		return (RDMA_FAILED);
4325 	}
4326 
4327 	/*
4328 	 * Channel to server doesn't exist yet, create one.
4329 	 */
4330 	if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
4331 		return (RDMA_FAILED);
4332 	}
4333 	cn = qptoc(qp);
4334 	cn->c_state = C_CONN_PEND;
4335 	cn->c_ref = 1;
4336 
4337 	/*
4338 	 * Add to conn list.
4339 	 * We had given up the READER lock. In the time since then,
4340 	 * another thread might have created the connection we are
4341 	 * trying here. But for now, that is quiet alright - there
4342 	 * might be two connections between a pair of hosts instead
4343 	 * of one. If we really want to close that window,
4344 	 * then need to check the list after acquiring the
4345 	 * WRITER lock.
4346 	 */
4347 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4348 	status = rib_conn_to_srv(hca, qp, &path);
4349 	mutex_enter(&cn->c_lock);
4350 	if (status == RDMA_SUCCESS) {
4351 		cn->c_state = C_CONNECTED;
4352 		*conn = cn;
4353 	} else {
4354 		cn->c_state = C_ERROR;
4355 		cn->c_ref--;
4356 #ifdef DEBUG
4357 		if (rib_debug) {
4358 			cmn_err(CE_WARN, "rib_conn_get: FAILED creating"
4359 			    " a channel!");
4360 		}
4361 #endif
4362 	}
4363 	cv_broadcast(&cn->c_cv);
4364 	mutex_exit(&cn->c_lock);
4365 	return (status);
4366 }
4367 
4368 static rdma_stat
4369 rib_conn_release(CONN *conn)
4370 {
4371 	rib_qp_t	*qp = ctoqp(conn);
4372 
4373 	mutex_enter(&conn->c_lock);
4374 	conn->c_ref--;
4375 
4376 	/*
4377 	 * If a conn is C_ERROR, close the channel.
4378 	 * If it's CONNECTED, keep it that way.
4379 	 */
4380 	if (conn->c_ref == 0 && (conn->c_state &  C_ERROR)) {
4381 		conn->c_state = C_DISCONN_PEND;
4382 		mutex_exit(&conn->c_lock);
4383 		if (qp->mode == RIB_SERVER)
4384 			(void) rib_disconnect_channel(conn,
4385 			    &qp->hca->srv_conn_list);
4386 		else
4387 			(void) rib_disconnect_channel(conn,
4388 			    &qp->hca->cl_conn_list);
4389 		return (RDMA_SUCCESS);
4390 	}
4391 	mutex_exit(&conn->c_lock);
4392 	return (RDMA_SUCCESS);
4393 }
4394 
4395 /*
4396  * Add at front of list
4397  */
4398 static struct rdma_done_list *
4399 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4400 {
4401 	struct rdma_done_list *rd;
4402 
4403 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4404 
4405 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4406 	rd->xid = xid;
4407 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4408 
4409 	rd->prev = NULL;
4410 	rd->next = qp->rdlist;
4411 	if (qp->rdlist != NULL)
4412 		qp->rdlist->prev = rd;
4413 	qp->rdlist = rd;
4414 
4415 	return (rd);
4416 }
4417 
4418 static void
4419 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4420 {
4421 	struct rdma_done_list *r;
4422 
4423 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4424 
4425 	r = rd->next;
4426 	if (r != NULL) {
4427 		r->prev = rd->prev;
4428 	}
4429 
4430 	r = rd->prev;
4431 	if (r != NULL) {
4432 		r->next = rd->next;
4433 	} else {
4434 		qp->rdlist = rd->next;
4435 	}
4436 
4437 	cv_destroy(&rd->rdma_done_cv);
4438 	kmem_free(rd, sizeof (*rd));
4439 }
4440 
4441 static void
4442 rdma_done_rem_list(rib_qp_t *qp)
4443 {
4444 	struct rdma_done_list	*r, *n;
4445 
4446 	mutex_enter(&qp->rdlist_lock);
4447 	for (r = qp->rdlist; r != NULL; r = n) {
4448 		n = r->next;
4449 		rdma_done_rm(qp, r);
4450 	}
4451 	mutex_exit(&qp->rdlist_lock);
4452 }
4453 
4454 static void
4455 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4456 {
4457 	struct rdma_done_list *r = qp->rdlist;
4458 
4459 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4460 
4461 	while (r) {
4462 		if (r->xid == xid) {
4463 			cv_signal(&r->rdma_done_cv);
4464 			return;
4465 		} else {
4466 			r = r->next;
4467 		}
4468 	}
4469 	if (rib_debug > 1) {
4470 	    cmn_err(CE_WARN, "rdma_done_notify: "
4471 		"No matching xid for %u, qp %p\n", xid, (void *)qp);
4472 	}
4473 }
4474 
4475 rpcib_ats_t *
4476 get_ibd_entry(ib_gid_t *gid, ib_pkey_t pkey, rpcib_ibd_insts_t *ibds)
4477 {
4478 	rpcib_ats_t		*atsp;
4479 	int			i;
4480 
4481 	for (i = 0, atsp = ibds->rib_ats; i < ibds->rib_ibd_cnt; i++, atsp++) {
4482 		if (atsp->ras_port_gid.gid_prefix == gid->gid_prefix &&
4483 		    atsp->ras_port_gid.gid_guid == gid->gid_guid &&
4484 		    atsp->ras_pkey == pkey) {
4485 			return (atsp);
4486 		}
4487 	}
4488 	return (NULL);
4489 }
4490 
4491 int
4492 rib_get_ibd_insts_cb(dev_info_t *dip, void *arg)
4493 {
4494 	rpcib_ibd_insts_t *ibds = (rpcib_ibd_insts_t *)arg;
4495 	rpcib_ats_t	*atsp;
4496 	ib_pkey_t	pkey;
4497 	uint8_t		port;
4498 	ib_guid_t	hca_guid;
4499 	ib_gid_t	port_gid;
4500 
4501 	if (i_ddi_devi_attached(dip) &&
4502 	    (strcmp(ddi_node_name(dip), "ibport") == 0) &&
4503 	    (strstr(ddi_get_name_addr(dip), "ipib") != NULL)) {
4504 
4505 		if (ibds->rib_ibd_cnt >= ibds->rib_ibd_alloc) {
4506 		    rpcib_ats_t	*tmp;
4507 
4508 		    tmp = (rpcib_ats_t *)kmem_zalloc((ibds->rib_ibd_alloc +
4509 			N_IBD_INSTANCES) * sizeof (rpcib_ats_t), KM_SLEEP);
4510 		    bcopy(ibds->rib_ats, tmp,
4511 			ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
4512 		    kmem_free(ibds->rib_ats,
4513 			ibds->rib_ibd_alloc * sizeof (rpcib_ats_t));
4514 		    ibds->rib_ats = tmp;
4515 		    ibds->rib_ibd_alloc += N_IBD_INSTANCES;
4516 		}
4517 		if (((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY,
4518 			dip, 0, "hca-guid", 0)) == 0) ||
4519 		    ((port = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
4520 			0, "port-number", 0)) == 0) ||
4521 		    (ibt_get_port_state_byguid(hca_guid, port,
4522 			&port_gid, NULL) != IBT_SUCCESS) ||
4523 		    ((pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
4524 			"port-pkey", IB_PKEY_INVALID_LIMITED)) <=
4525 			IB_PKEY_INVALID_FULL)) {
4526 		    return (DDI_WALK_CONTINUE);
4527 		}
4528 		atsp = &ibds->rib_ats[ibds->rib_ibd_cnt];
4529 		atsp->ras_inst = ddi_get_instance(dip);
4530 		atsp->ras_pkey = pkey;
4531 		atsp->ras_port_gid = port_gid;
4532 		ibds->rib_ibd_cnt++;
4533 	}
4534 	return (DDI_WALK_CONTINUE);
4535 }
4536 
4537 void
4538 rib_get_ibd_insts(rpcib_ibd_insts_t *ibds)
4539 {
4540 	ddi_walk_devs(ddi_root_node(), rib_get_ibd_insts_cb, ibds);
4541 }
4542 
4543 /*
4544  * Return ibd interfaces and ibd instances.
4545  */
4546 int
4547 get_ibd_ipaddr(rpcib_ibd_insts_t *ibds)
4548 {
4549 	TIUSER			*tiptr, *tiptr6;
4550 	vnode_t			*kvp, *kvp6;
4551 	vnode_t			*vp = NULL, *vp6 = NULL;
4552 	struct strioctl		iocb;
4553 	struct lifreq		lif_req;
4554 	int			k, ip_cnt;
4555 	rpcib_ats_t		*atsp;
4556 
4557 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
4558 		&kvp) == 0) {
4559 	    if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
4560 		&tiptr, CRED()) == 0) {
4561 		vp = tiptr->fp->f_vnode;
4562 	    } else {
4563 		VN_RELE(kvp);
4564 	    }
4565 	}
4566 
4567 	if (lookupname("/dev/udp6", UIO_SYSSPACE, FOLLOW, NULLVPP,
4568 		&kvp6) == 0) {
4569 	    if (t_kopen((file_t *)NULL, kvp6->v_rdev, FREAD|FWRITE,
4570 		&tiptr6, CRED()) == 0) {
4571 		vp6 = tiptr6->fp->f_vnode;
4572 	    } else {
4573 		VN_RELE(kvp6);
4574 	    }
4575 	}
4576 
4577 	if (vp == NULL && vp6 == NULL)
4578 		return (-1);
4579 
4580 	/* Get ibd ip's */
4581 	ip_cnt = 0;
4582 	for (k = 0, atsp = ibds->rib_ats; k < ibds->rib_ibd_cnt; k++, atsp++) {
4583 		/* IPv4 */
4584 	    if (vp != NULL) {
4585 		(void) bzero((void *)&lif_req, sizeof (struct lifreq));
4586 		(void) snprintf(lif_req.lifr_name,
4587 			sizeof (lif_req.lifr_name), "%s%d",
4588 			IBD_NAME, atsp->ras_inst);
4589 
4590 		(void) bzero((void *)&iocb, sizeof (struct strioctl));
4591 		iocb.ic_cmd = SIOCGLIFADDR;
4592 		iocb.ic_timout = 0;
4593 		iocb.ic_len = sizeof (struct lifreq);
4594 		iocb.ic_dp = (caddr_t)&lif_req;
4595 		if (kstr_ioctl(vp, I_STR, (intptr_t)&iocb) == 0) {
4596 		    atsp->ras_inet_type = AF_INET;
4597 		    bcopy(&lif_req.lifr_addr, &atsp->ras_sin,
4598 			sizeof (struct sockaddr_in));
4599 		    ip_cnt++;
4600 		    continue;
4601 		}
4602 	    }
4603 		/* Try IPv6 */
4604 	    if (vp6 != NULL) {
4605 		(void) bzero((void *)&lif_req, sizeof (struct lifreq));
4606 		(void) snprintf(lif_req.lifr_name,
4607 			sizeof (lif_req.lifr_name), "%s%d",
4608 			IBD_NAME, atsp->ras_inst);
4609 
4610 		(void) bzero((void *)&iocb, sizeof (struct strioctl));
4611 		iocb.ic_cmd = SIOCGLIFADDR;
4612 		iocb.ic_timout = 0;
4613 		iocb.ic_len = sizeof (struct lifreq);
4614 		iocb.ic_dp = (caddr_t)&lif_req;
4615 		if (kstr_ioctl(vp6, I_STR, (intptr_t)&iocb) == 0) {
4616 
4617 		    atsp->ras_inet_type = AF_INET6;
4618 		    bcopy(&lif_req.lifr_addr, &atsp->ras_sin6,
4619 			    sizeof (struct sockaddr_in6));
4620 		    ip_cnt++;
4621 		}
4622 	    }
4623 	}
4624 
4625 	if (vp6 != NULL) {
4626 	    (void) t_kclose(tiptr6, 0);
4627 	    VN_RELE(kvp6);
4628 	}
4629 	if (vp != NULL) {
4630 	    (void) t_kclose(tiptr, 0);
4631 	    VN_RELE(kvp);
4632 	}
4633 
4634 	if (ip_cnt == 0)
4635 	    return (-1);
4636 	else
4637 	    return (0);
4638 }
4639 
4640 char **
4641 get_ip_addrs(int *count)
4642 {
4643 	TIUSER			*tiptr;
4644 	vnode_t			*kvp;
4645 	int			num_of_ifs;
4646 	char			**addresses;
4647 	int			return_code;
4648 
4649 	/*
4650 	 * Open a device for doing down stream kernel ioctls
4651 	 */
4652 	return_code = lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW,
4653 	    NULLVPP, &kvp);
4654 	if (return_code != 0) {
4655 		cmn_err(CE_NOTE, "get_Ip_addrs: lookupname failed\n");
4656 		*count = -1;
4657 		return (NULL);
4658 	}
4659 
4660 	return_code = t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
4661 	    &tiptr, CRED());
4662 	if (return_code != 0) {
4663 		cmn_err(CE_NOTE, "get_Ip_addrs: t_kopen failed\n");
4664 		VN_RELE(kvp);
4665 		*count = -1;
4666 		return (NULL);
4667 	}
4668 
4669 	/*
4670 	 * Perform the first ioctl to get the number of interfaces
4671 	 */
4672 	return_code = get_interfaces(tiptr, &num_of_ifs);
4673 	if (return_code != 0 || num_of_ifs == 0) {
4674 		cmn_err(CE_NOTE, "get_Ip_addrs: get_interfaces failed\n");
4675 		(void) t_kclose(tiptr, 0);
4676 		VN_RELE(kvp);
4677 		*count = -1;
4678 		return (NULL);
4679 	}
4680 
4681 	/*
4682 	 * Perform the second ioctl to get the address on each interface
4683 	 * found.
4684 	 */
4685 	addresses = kmem_zalloc(num_of_ifs * sizeof (char *), KM_SLEEP);
4686 	return_code = find_addrs(tiptr, addresses, num_of_ifs);
4687 	if (return_code <= 0) {
4688 		cmn_err(CE_NOTE, "get_Ip_addrs: find_addrs failed\n");
4689 		(void) t_kclose(tiptr, 0);
4690 		kmem_free(addresses, num_of_ifs * sizeof (char *));
4691 		VN_RELE(kvp);
4692 		*count = -1;
4693 		return (NULL);
4694 	}
4695 
4696 	*count = return_code;
4697 	VN_RELE(kvp);
4698 	(void) t_kclose(tiptr, 0);
4699 	return (addresses);
4700 }
4701 
4702 int
4703 get_interfaces(TIUSER *tiptr, int *num)
4704 {
4705 	struct lifnum		if_buf;
4706 	struct strioctl		iocb;
4707 	vnode_t			*vp;
4708 	int			return_code;
4709 
4710 	/*
4711 	 * Prep the number of interfaces request buffer for ioctl
4712 	 */
4713 	(void) bzero((void *)&if_buf, sizeof (struct lifnum));
4714 	if_buf.lifn_family = AF_UNSPEC;
4715 	if_buf.lifn_flags = 0;
4716 
4717 	/*
4718 	 * Prep the kernel ioctl buffer and send it down stream
4719 	 */
4720 	(void) bzero((void *)&iocb, sizeof (struct strioctl));
4721 	iocb.ic_cmd = SIOCGLIFNUM;
4722 	iocb.ic_timout = 0;
4723 	iocb.ic_len = sizeof (if_buf);
4724 	iocb.ic_dp = (caddr_t)&if_buf;
4725 
4726 	vp = tiptr->fp->f_vnode;
4727 	return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4728 	if (return_code != 0) {
4729 		cmn_err(CE_NOTE, "get_interfaces: kstr_ioctl failed\n");
4730 		*num = -1;
4731 		return (-1);
4732 	}
4733 
4734 	*num = if_buf.lifn_count;
4735 #ifdef	DEBUG
4736 	if (rib_debug > 1)
4737 		cmn_err(CE_NOTE, "Number of interfaces detected: %d\n",
4738 		    if_buf.lifn_count);
4739 #endif
4740 	return (0);
4741 }
4742 
4743 int
4744 find_addrs(TIUSER *tiptr, char **addrs, int num_ifs)
4745 {
4746 	struct lifconf		lifc;
4747 	struct lifreq		*if_data_buf;
4748 	struct strioctl		iocb;
4749 	caddr_t			request_buffer;
4750 	struct sockaddr_in	*sin4;
4751 	struct sockaddr_in6	*sin6;
4752 	vnode_t			*vp;
4753 	int			i, count, return_code;
4754 
4755 	/*
4756 	 * Prep the buffer for requesting all interface's info
4757 	 */
4758 	(void) bzero((void *)&lifc, sizeof (struct lifconf));
4759 	lifc.lifc_family = AF_UNSPEC;
4760 	lifc.lifc_flags = 0;
4761 	lifc.lifc_len = num_ifs * sizeof (struct lifreq);
4762 
4763 	request_buffer = kmem_zalloc(num_ifs * sizeof (struct lifreq),
4764 	    KM_SLEEP);
4765 
4766 	lifc.lifc_buf = request_buffer;
4767 
4768 	/*
4769 	 * Prep the kernel ioctl buffer and send it down stream
4770 	 */
4771 	(void) bzero((void *)&iocb, sizeof (struct strioctl));
4772 	iocb.ic_cmd = SIOCGLIFCONF;
4773 	iocb.ic_timout = 0;
4774 	iocb.ic_len = sizeof (struct lifconf);
4775 	iocb.ic_dp = (caddr_t)&lifc;
4776 
4777 	vp = tiptr->fp->f_vnode;
4778 	return_code = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4779 	if (return_code != 0) {
4780 		cmn_err(CE_NOTE, "find_addrs: kstr_ioctl failed\n");
4781 		kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
4782 		return (-1);
4783 	}
4784 
4785 	/*
4786 	 * Extract addresses and fill them in the requested array
4787 	 * IB_SVC_NAME_LEN is defined to be 64 so it  covers both IPv4 &
4788 	 * IPv6. Here count is the number of IP addresses collected.
4789 	 */
4790 	if_data_buf = lifc.lifc_req;
4791 	count = 0;
4792 	for (i = lifc.lifc_len / sizeof (struct lifreq); i > 0; i--,
4793 	if_data_buf++) {
4794 		if (if_data_buf->lifr_addr.ss_family == AF_INET) {
4795 			sin4 = (struct sockaddr_in *)&if_data_buf->lifr_addr;
4796 			addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
4797 			(void) inet_ntop(AF_INET, &sin4->sin_addr,
4798 			    addrs[count], IB_SVC_NAME_LEN);
4799 			count ++;
4800 		}
4801 
4802 		if (if_data_buf->lifr_addr.ss_family == AF_INET6) {
4803 			sin6 = (struct sockaddr_in6 *)&if_data_buf->lifr_addr;
4804 			addrs[count] = kmem_zalloc(IB_SVC_NAME_LEN, KM_SLEEP);
4805 			(void) inet_ntop(AF_INET6, &sin6->sin6_addr,
4806 			    addrs[count], IB_SVC_NAME_LEN);
4807 			count ++;
4808 		}
4809 	}
4810 
4811 	kmem_free(request_buffer, num_ifs * sizeof (struct lifreq));
4812 	return (count);
4813 }
4814 
4815 /*
4816  * Goes through all connections and closes the channel
4817  * This will cause all the WRs on those channels to be
4818  * flushed.
4819  */
4820 static void
4821 rib_close_channels(rib_conn_list_t *connlist)
4822 {
4823 	CONN 		*conn;
4824 	rib_qp_t	*qp;
4825 
4826 	rw_enter(&connlist->conn_lock, RW_READER);
4827 	conn = connlist->conn_hd;
4828 	while (conn != NULL) {
4829 		mutex_enter(&conn->c_lock);
4830 		qp = ctoqp(conn);
4831 		if (conn->c_state & C_CONNECTED) {
4832 			/*
4833 			 * Live connection in CONNECTED state.
4834 			 * Call ibt_close_rc_channel in nonblocking mode
4835 			 * with no callbacks.
4836 			 */
4837 			conn->c_state = C_ERROR;
4838 			(void) ibt_close_rc_channel(qp->qp_hdl,
4839 				IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
4840 			(void) ibt_free_channel(qp->qp_hdl);
4841 			qp->qp_hdl = NULL;
4842 		} else {
4843 			if (conn->c_state == C_ERROR &&
4844 				qp->qp_hdl != NULL) {
4845 				/*
4846 				 * Connection in ERROR state but
4847 				 * channel is not yet freed.
4848 				 */
4849 				(void) ibt_close_rc_channel(qp->qp_hdl,
4850 					IBT_NOCALLBACKS, NULL, 0, NULL,
4851 					NULL, 0);
4852 				(void) ibt_free_channel(qp->qp_hdl);
4853 				qp->qp_hdl = NULL;
4854 			}
4855 		}
4856 		mutex_exit(&conn->c_lock);
4857 		conn = conn->c_next;
4858 	}
4859 	rw_exit(&connlist->conn_lock);
4860 }
4861 
4862 /*
4863  * Frees up all connections that are no longer being referenced
4864  */
4865 static void
4866 rib_purge_connlist(rib_conn_list_t *connlist)
4867 {
4868 	CONN 		*conn;
4869 
4870 top:
4871 	rw_enter(&connlist->conn_lock, RW_READER);
4872 	conn = connlist->conn_hd;
4873 	while (conn != NULL) {
4874 		mutex_enter(&conn->c_lock);
4875 
4876 		/*
4877 		 * At this point connection is either in ERROR
4878 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4879 		 * then some other thread is culling that connection.
4880 		 * If not and if c_ref is 0, then destroy the connection.
4881 		 */
4882 		if (conn->c_ref == 0 &&
4883 			conn->c_state != C_DISCONN_PEND) {
4884 			/*
4885 			 * Cull the connection
4886 			 */
4887 			conn->c_state = C_DISCONN_PEND;
4888 			mutex_exit(&conn->c_lock);
4889 			rw_exit(&connlist->conn_lock);
4890 			(void) rib_disconnect_channel(conn, connlist);
4891 			goto top;
4892 		} else {
4893 			/*
4894 			 * conn disconnect already scheduled or will
4895 			 * happen from conn_release when c_ref drops to 0.
4896 			 */
4897 			mutex_exit(&conn->c_lock);
4898 		}
4899 		conn = conn->c_next;
4900 	}
4901 	rw_exit(&connlist->conn_lock);
4902 
4903 	/*
4904 	 * At this point, only connections with c_ref != 0 are on the list
4905 	 */
4906 }
4907 
4908 /*
4909  * Cleans and closes up all uses of the HCA
4910  */
4911 static void
4912 rib_detach_hca(rib_hca_t *hca)
4913 {
4914 
4915 	/*
4916 	 * Stop all services on the HCA
4917 	 * Go through cl_conn_list and close all rc_channels
4918 	 * Go through svr_conn_list and close all rc_channels
4919 	 * Free connections whose c_ref has dropped to 0
4920 	 * Destroy all CQs
4921 	 * Deregister and released all buffer pool memory after all
4922 	 * connections are destroyed
4923 	 * Free the protection domain
4924 	 * ibt_close_hca()
4925 	 */
4926 	rw_enter(&hca->state_lock, RW_WRITER);
4927 	if (hca->state == HCA_DETACHED) {
4928 		rw_exit(&hca->state_lock);
4929 		return;
4930 	}
4931 
4932 	hca->state = HCA_DETACHED;
4933 	rib_stat->nhca_inited--;
4934 
4935 	rib_stop_services(hca);
4936 	rib_deregister_ats();
4937 	rib_close_channels(&hca->cl_conn_list);
4938 	rib_close_channels(&hca->srv_conn_list);
4939 	rw_exit(&hca->state_lock);
4940 
4941 	rib_purge_connlist(&hca->cl_conn_list);
4942 	rib_purge_connlist(&hca->srv_conn_list);
4943 
4944 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4945 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4946 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4947 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4948 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4949 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4950 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4951 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4952 
4953 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4954 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4955 	if (hca->srv_conn_list.conn_hd == NULL &&
4956 		hca->cl_conn_list.conn_hd == NULL) {
4957 		/*
4958 		 * conn_lists are NULL, so destroy
4959 		 * buffers, close hca and be done.
4960 		 */
4961 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4962 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4963 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4964 		(void) ibt_close_hca(hca->hca_hdl);
4965 		hca->hca_hdl = NULL;
4966 	}
4967 	rw_exit(&hca->cl_conn_list.conn_lock);
4968 	rw_exit(&hca->srv_conn_list.conn_lock);
4969 
4970 	if (hca->hca_hdl != NULL) {
4971 		mutex_enter(&hca->inuse_lock);
4972 		while (hca->inuse)
4973 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4974 		mutex_exit(&hca->inuse_lock);
4975 		/*
4976 		 * conn_lists are now NULL, so destroy
4977 		 * buffers, close hca and be done.
4978 		 */
4979 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4980 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4981 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4982 		(void) ibt_close_hca(hca->hca_hdl);
4983 		hca->hca_hdl = NULL;
4984 	}
4985 }
4986