xref: /titanic_51/usr/src/uts/common/rpc/rpcib.c (revision bbaa8b60dd95d714741fc474adad3cf710ef4efd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2007, The Ohio State University. All rights reserved.
27  *
28  * Portions of this source code is developed by the team members of
29  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
30  * headed by Professor Dhabaleswar K. (DK) Panda.
31  *
32  * Acknowledgements to contributions from developors:
33  *   Ranjit Noronha: noronha@cse.ohio-state.edu
34  *   Lei Chai      : chail@cse.ohio-state.edu
35  *   Weikuan Yu    : yuw@cse.ohio-state.edu
36  *
37  */
38 
39 /*
40  * The rpcib plugin. Implements the interface for RDMATF's
41  * interaction with IBTF.
42  */
43 
44 #include <sys/param.h>
45 #include <sys/types.h>
46 #include <sys/user.h>
47 #include <sys/systm.h>
48 #include <sys/sysmacros.h>
49 #include <sys/proc.h>
50 #include <sys/socket.h>
51 #include <sys/file.h>
52 #include <sys/stream.h>
53 #include <sys/strsubr.h>
54 #include <sys/stropts.h>
55 #include <sys/errno.h>
56 #include <sys/kmem.h>
57 #include <sys/debug.h>
58 #include <sys/pathname.h>
59 #include <sys/kstat.h>
60 #include <sys/t_lock.h>
61 #include <sys/ddi.h>
62 #include <sys/cmn_err.h>
63 #include <sys/time.h>
64 #include <sys/isa_defs.h>
65 #include <sys/callb.h>
66 #include <sys/sunddi.h>
67 #include <sys/sunndi.h>
68 #include <sys/sdt.h>
69 #include <sys/ib/ibtl/ibti.h>
70 #include <rpc/rpc.h>
71 #include <rpc/ib.h>
72 #include <sys/modctl.h>
73 #include <sys/kstr.h>
74 #include <sys/sockio.h>
75 #include <sys/vnode.h>
76 #include <sys/tiuser.h>
77 #include <net/if.h>
78 #include <net/if_types.h>
79 #include <sys/cred.h>
80 #include <rpc/rpc_rdma.h>
81 #include <nfs/nfs.h>
82 #include <sys/atomic.h>
83 
84 #define	NFS_RDMA_PORT	20049
85 
86 
87 /*
88  * Convenience structures for connection management
89  */
90 typedef struct rpcib_ipaddrs {
91 	void	*ri_list;	/* pointer to list of addresses */
92 	uint_t	ri_count;	/* number of addresses in list */
93 	uint_t	ri_size;	/* size of ri_list in bytes */
94 } rpcib_ipaddrs_t;
95 
96 
97 typedef struct rpcib_ping {
98 	rib_hca_t  *hca;
99 	ibt_path_info_t path;
100 	ibt_ip_addr_t srcip;
101 	ibt_ip_addr_t dstip;
102 } rpcib_ping_t;
103 
104 /*
105  * Prototype declarations for driver ops
106  */
107 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
108 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
109 				void *, void **);
110 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
111 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
112 static int	rpcib_do_ip_ioctl(int, int, void *);
113 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
114 static int rpcib_cache_kstat_update(kstat_t *, int);
115 static void rib_force_cleanup(void *);
116 static void rib_stop_hca_services(rib_hca_t *);
117 static void rib_attach_hca(void);
118 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
119 		struct netbuf *d_svcaddr, CONN **conn);
120 
121 struct {
122 	kstat_named_t cache_limit;
123 	kstat_named_t cache_allocation;
124 	kstat_named_t cache_hits;
125 	kstat_named_t cache_misses;
126 	kstat_named_t cache_misses_above_the_limit;
127 } rpcib_kstat = {
128 	{"cache_limit",			KSTAT_DATA_UINT64 },
129 	{"cache_allocation",		KSTAT_DATA_UINT64 },
130 	{"cache_hits",			KSTAT_DATA_UINT64 },
131 	{"cache_misses",		KSTAT_DATA_UINT64 },
132 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
133 };
134 
135 /* rpcib cb_ops */
136 static struct cb_ops rpcib_cbops = {
137 	nulldev,		/* open */
138 	nulldev,		/* close */
139 	nodev,			/* strategy */
140 	nodev,			/* print */
141 	nodev,			/* dump */
142 	nodev,			/* read */
143 	nodev,			/* write */
144 	nodev,			/* ioctl */
145 	nodev,			/* devmap */
146 	nodev,			/* mmap */
147 	nodev,			/* segmap */
148 	nochpoll,		/* poll */
149 	ddi_prop_op,		/* prop_op */
150 	NULL,			/* stream */
151 	D_MP,			/* cb_flag */
152 	CB_REV,			/* rev */
153 	nodev,			/* int (*cb_aread)() */
154 	nodev			/* int (*cb_awrite)() */
155 };
156 
157 /*
158  * Device options
159  */
160 static struct dev_ops rpcib_ops = {
161 	DEVO_REV,		/* devo_rev, */
162 	0,			/* refcnt  */
163 	rpcib_getinfo,		/* info */
164 	nulldev,		/* identify */
165 	nulldev,		/* probe */
166 	rpcib_attach,		/* attach */
167 	rpcib_detach,		/* detach */
168 	nodev,			/* reset */
169 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
170 	NULL,			/* bus operations */
171 	NULL,			/* power */
172 	ddi_quiesce_not_needed,		/* quiesce */
173 };
174 
175 /*
176  * Module linkage information.
177  */
178 
179 static struct modldrv rib_modldrv = {
180 	&mod_driverops,		/* Driver module */
181 	"RPCIB plugin driver",	/* Driver name and version */
182 	&rpcib_ops,		/* Driver ops */
183 };
184 
185 static struct modlinkage rib_modlinkage = {
186 	MODREV_1,
187 	(void *)&rib_modldrv,
188 	NULL
189 };
190 
191 typedef struct rib_lrc_entry {
192 	struct rib_lrc_entry *forw;
193 	struct rib_lrc_entry *back;
194 	char *lrc_buf;
195 
196 	uint32_t lrc_len;
197 	void  *avl_node;
198 	bool_t registered;
199 
200 	struct mrc lrc_mhandle;
201 	bool_t lrc_on_freed_list;
202 } rib_lrc_entry_t;
203 
204 typedef	struct cache_struct	{
205 	rib_lrc_entry_t		r;
206 	uint32_t		len;
207 	uint32_t		elements;
208 	kmutex_t		node_lock;
209 	avl_node_t		avl_link;
210 } cache_avl_struct_t;
211 
212 uint64_t	cache_limit = 100 * 1024 * 1024;
213 static uint64_t	cache_watermark = 80 * 1024 * 1024;
214 static bool_t	stats_enabled = FALSE;
215 
216 static uint64_t max_unsignaled_rws = 5;
217 int nfs_rdma_port = NFS_RDMA_PORT;
218 
219 #define	RIBNETID_TCP	"tcp"
220 #define	RIBNETID_TCP6	"tcp6"
221 
222 /*
223  * rib_stat: private data pointer used when registering
224  *	with the IBTF.  It is returned to the consumer
225  *	in all callbacks.
226  */
227 static rpcib_state_t *rib_stat = NULL;
228 
229 #define	RNR_RETRIES	IBT_RNR_RETRY_1
230 #define	MAX_PORTS	2
231 #define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
232 #define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */
233 
234 int preposted_rbufs = RDMA_BUFS_GRANT;
235 int send_threshold = 1;
236 
237 /*
238  * Old cards with Tavor driver have limited memory footprint
239  * when booted in 32bit. The rib_max_rbufs tunable can be
240  * tuned for more buffers if needed.
241  */
242 
243 #if !defined(_ELF64) && !defined(__sparc)
244 int rib_max_rbufs = MAX_BUFS;
245 #else
246 int rib_max_rbufs = 10 * MAX_BUFS;
247 #endif	/* !(_ELF64) && !(__sparc) */
248 
249 int rib_conn_timeout = 60 * 12;		/* 12 minutes */
250 
/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connections and requests.
 * This should eventually move to the rpcib_state_t structure, since it
 * tells which state the plugin is in for a particular type of service
 * like NFS, NLM or the v4 callback daemon. The plugin might be in the
 * accept state for one and in the no_accept state for the other.
 */
260 int		plugin_state;
261 kmutex_t	plugin_state_lock;
262 
263 ldi_ident_t rpcib_li;
264 
265 /*
266  * RPCIB RDMATF operations
267  */
268 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
269 static rdma_stat rib_disconnect(CONN *conn);
270 static void rib_listen(struct rdma_svc_data *rd);
271 static void rib_listen_stop(struct rdma_svc_data *rd);
272 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
273 	uint_t buflen, struct mrc *buf_handle);
274 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
275 	struct mrc buf_handle);
276 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
277 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
278 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
279 		struct mrc buf_handle);
280 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
281 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
282 	void *lrc);
283 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
284 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
285 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
286 	caddr_t buf, int len, int cpu);
287 
288 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
289 
290 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
291 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
292 
293 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
294 
295 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
296 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
297 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
298 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
299 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
300 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
301 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
302 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
303 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
304 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
305 	int addr_type, void *, CONN **);
306 static rdma_stat rib_conn_release(CONN *conn);
307 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
308 	rpcib_ping_t *, CONN **);
309 static rdma_stat rib_getinfo(rdma_info_t *info);
310 
311 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
312 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
313 static void rib_destroy_cache(rib_hca_t *hca);
314 static	void	rib_server_side_cache_reclaim(void *argp);
315 static int avl_compare(const void *t1, const void *t2);
316 
317 static void rib_stop_services(rib_hca_t *);
318 static void rib_close_channels(rib_conn_list_t *);
319 static void rib_conn_close(void *);
320 static void rib_recv_rele(rib_qp_t *);
321 static rdma_stat rib_conn_release_locked(CONN *conn);
322 
323 /*
324  * RPCIB addressing operations
325  */
326 
327 /*
328  * RDMA operations the RPCIB module exports
329  */
330 static rdmaops_t rib_ops = {
331 	rib_reachable,
332 	rib_conn_get,
333 	rib_conn_release,
334 	rib_listen,
335 	rib_listen_stop,
336 	rib_registermem,
337 	rib_deregistermem,
338 	rib_registermemsync,
339 	rib_deregistermemsync,
340 	rib_syncmem,
341 	rib_reg_buf_alloc,
342 	rib_reg_buf_free,
343 	rib_send,
344 	rib_send_resp,
345 	rib_post_resp,
346 	rib_post_resp_remove,
347 	rib_post_recv,
348 	rib_recv,
349 	rib_read,
350 	rib_write,
351 	rib_getinfo,
352 };
353 
354 /*
355  * RDMATF RPCIB plugin details
356  */
357 static rdma_mod_t rib_mod = {
358 	"ibtf",		/* api name */
359 	RDMATF_VERS_1,
360 	0,
361 	&rib_ops,	/* rdma op vector for ibtf */
362 };
363 
364 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
365 static rdma_stat rib_qp_init(rib_qp_t *, int);
366 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
367 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
368 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
369 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
370 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
371 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
372 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
373 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
374 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
375 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
376 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
377 	rib_qp_t **);
378 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
379 	rib_qp_t **);
380 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
381 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
382 static int rib_free_sendwait(struct send_wid *);
383 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
384 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
385 static void rdma_done_rem_list(rib_qp_t *);
386 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
387 
388 static void rib_async_handler(void *,
389 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
390 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
391 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
392 static int rib_free_svc_recv(struct svc_recv *);
393 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
394 static void rib_free_wid(struct recv_wid *);
395 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
396 static void rib_detach_hca(ibt_hca_hdl_t);
397 static void rib_close_a_channel(CONN *);
398 static void rib_send_hold(rib_qp_t *);
399 static void rib_send_rele(rib_qp_t *);
400 
401 /*
402  * Registration with IBTF as a consumer
403  */
404 static struct ibt_clnt_modinfo_s rib_modinfo = {
405 	IBTI_V_CURR,
406 	IBT_GENERIC,
407 	rib_async_handler,	/* async event handler */
408 	NULL,			/* Memory Region Handler */
409 	"nfs/ib"
410 };
411 
/*
 * Global structure
 */
415 
416 typedef struct rpcib_s {
417 	dev_info_t	*rpcib_dip;
418 	kmutex_t	rpcib_mutex;
419 } rpcib_t;
420 
421 rpcib_t rpcib;
422 
/*
 * /etc/system controlled variable to control
 * debugging in the rpcib kernel module.
 * Set it to values greater than 1 to control
 * the amount of debugging messages required.
 */
429 int rib_debug = 0;
430 
431 int
432 _init(void)
433 {
434 	int error;
435 
436 	error = mod_install((struct modlinkage *)&rib_modlinkage);
437 	if (error != 0) {
438 		/*
439 		 * Could not load module
440 		 */
441 		return (error);
442 	}
443 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
444 	return (0);
445 }
446 
447 int
448 _fini()
449 {
450 	int status;
451 
452 	/*
453 	 * Remove module
454 	 */
455 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
456 		return (status);
457 	}
458 	mutex_destroy(&plugin_state_lock);
459 	return (0);
460 }
461 
462 int
463 _info(struct modinfo *modinfop)
464 {
465 	return (mod_info(&rib_modlinkage, modinfop));
466 }
467 
468 /*
469  * rpcib_getinfo()
470  * Given the device number, return the devinfo pointer or the
471  * instance number.
472  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
473  */
474 
475 /*ARGSUSED*/
476 static int
477 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
478 {
479 	int ret = DDI_SUCCESS;
480 
481 	switch (cmd) {
482 	case DDI_INFO_DEVT2DEVINFO:
483 		if (rpcib.rpcib_dip != NULL)
484 			*result = rpcib.rpcib_dip;
485 		else {
486 			*result = NULL;
487 			ret = DDI_FAILURE;
488 		}
489 		break;
490 
491 	case DDI_INFO_DEVT2INSTANCE:
492 		*result = NULL;
493 		break;
494 
495 	default:
496 		ret = DDI_FAILURE;
497 	}
498 	return (ret);
499 }
500 
501 static void
502 rpcib_free_hca_list()
503 {
504 	rib_hca_t *hca, *hcap;
505 
506 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
507 	hca = rib_stat->hcas_list;
508 	rib_stat->hcas_list = NULL;
509 	rw_exit(&rib_stat->hcas_list_lock);
510 	while (hca != NULL) {
511 		rw_enter(&hca->state_lock, RW_WRITER);
512 		hcap = hca;
513 		hca = hca->next;
514 		rib_stat->nhca_inited--;
515 		rib_mod.rdma_count--;
516 		hcap->state = HCA_DETACHED;
517 		rw_exit(&hcap->state_lock);
518 		rib_stop_hca_services(hcap);
519 
520 		kmem_free(hcap, sizeof (*hcap));
521 	}
522 }
523 
524 static rdma_stat
525 rpcib_free_service_list()
526 {
527 	rib_service_t *service;
528 	ibt_status_t ret;
529 
530 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
531 	while (rib_stat->service_list != NULL) {
532 		service = rib_stat->service_list;
533 		ret = ibt_unbind_all_services(service->srv_hdl);
534 		if (ret != IBT_SUCCESS) {
535 			rw_exit(&rib_stat->service_list_lock);
536 #ifdef DEBUG
537 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
538 			    "ibt_unbind_all_services failed (%d)\n", (int)ret);
539 #endif
540 			return (RDMA_FAILED);
541 		}
542 		ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
543 		    service->srv_hdl);
544 		if (ret != IBT_SUCCESS) {
545 			rw_exit(&rib_stat->service_list_lock);
546 #ifdef DEBUG
547 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
548 			    "ibt_deregister_service failed (%d)\n", (int)ret);
549 #endif
550 			return (RDMA_FAILED);
551 		}
552 		rib_stat->service_list = service->next;
553 		kmem_free(service, sizeof (rib_service_t));
554 	}
555 	rw_exit(&rib_stat->service_list_lock);
556 
557 	return (RDMA_SUCCESS);
558 }
559 
560 static int
561 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
562 {
563 	ibt_status_t	ibt_status;
564 	rdma_stat	r_status;
565 
566 	switch (cmd) {
567 	case DDI_ATTACH:
568 		break;
569 	case DDI_RESUME:
570 		return (DDI_SUCCESS);
571 	default:
572 		return (DDI_FAILURE);
573 	}
574 
575 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
576 
577 	mutex_enter(&rpcib.rpcib_mutex);
578 	if (rpcib.rpcib_dip != NULL) {
579 		mutex_exit(&rpcib.rpcib_mutex);
580 		return (DDI_FAILURE);
581 	}
582 	rpcib.rpcib_dip = dip;
583 	mutex_exit(&rpcib.rpcib_mutex);
584 	/*
585 	 * Create the "rpcib" minor-node.
586 	 */
587 	if (ddi_create_minor_node(dip,
588 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
589 		/* Error message, no cmn_err as they print on console */
590 		return (DDI_FAILURE);
591 	}
592 
593 	if (rib_stat == NULL) {
594 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
595 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
596 		rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
597 		mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
598 	}
599 
600 	rib_stat->hca_count = ibt_get_hca_list(NULL);
601 	if (rib_stat->hca_count < 1) {
602 		mutex_destroy(&rib_stat->listen_lock);
603 		rw_destroy(&rib_stat->hcas_list_lock);
604 		mutex_destroy(&rib_stat->open_hca_lock);
605 		kmem_free(rib_stat, sizeof (*rib_stat));
606 		rib_stat = NULL;
607 		return (DDI_FAILURE);
608 	}
609 
610 	ibt_status = ibt_attach(&rib_modinfo, dip,
611 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
612 
613 	if (ibt_status != IBT_SUCCESS) {
614 		mutex_destroy(&rib_stat->listen_lock);
615 		rw_destroy(&rib_stat->hcas_list_lock);
616 		mutex_destroy(&rib_stat->open_hca_lock);
617 		kmem_free(rib_stat, sizeof (*rib_stat));
618 		rib_stat = NULL;
619 		return (DDI_FAILURE);
620 	}
621 
622 	rib_stat->service_list = NULL;
623 	rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
624 	mutex_enter(&rib_stat->open_hca_lock);
625 	if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
626 		mutex_exit(&rib_stat->open_hca_lock);
627 		goto open_fail;
628 	}
629 	mutex_exit(&rib_stat->open_hca_lock);
630 
631 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
632 	    DDI_PROP_SUCCESS) {
633 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
634 		    "failed.");
635 		goto register_fail;
636 	}
637 
638 	/*
639 	 * Register with rdmatf
640 	 */
641 	r_status = rdma_register_mod(&rib_mod);
642 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
643 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
644 		    "status = %d", r_status);
645 		goto register_fail;
646 	}
647 
648 	return (DDI_SUCCESS);
649 
650 register_fail:
651 
652 open_fail:
653 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
654 	rpcib_free_hca_list();
655 	(void) rpcib_free_service_list();
656 	mutex_destroy(&rib_stat->listen_lock);
657 	rw_destroy(&rib_stat->hcas_list_lock);
658 	mutex_destroy(&rib_stat->open_hca_lock);
659 	rw_destroy(&rib_stat->service_list_lock);
660 	kmem_free(rib_stat, sizeof (*rib_stat));
661 	rib_stat = NULL;
662 	return (DDI_FAILURE);
663 }
664 
665 /*ARGSUSED*/
666 static int
667 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
668 {
669 	switch (cmd) {
670 
671 	case DDI_DETACH:
672 		break;
673 
674 	case DDI_SUSPEND:
675 	default:
676 		return (DDI_FAILURE);
677 	}
678 
679 	/*
680 	 * Detach the hca and free resources
681 	 */
682 	mutex_enter(&plugin_state_lock);
683 	plugin_state = NO_ACCEPT;
684 	mutex_exit(&plugin_state_lock);
685 
686 	if (rpcib_free_service_list() != RDMA_SUCCESS)
687 		return (DDI_FAILURE);
688 	rpcib_free_hca_list();
689 
690 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
691 	mutex_destroy(&rib_stat->listen_lock);
692 	rw_destroy(&rib_stat->hcas_list_lock);
693 	mutex_destroy(&rib_stat->open_hca_lock);
694 	rw_destroy(&rib_stat->service_list_lock);
695 
696 	kmem_free(rib_stat, sizeof (*rib_stat));
697 	rib_stat = NULL;
698 
699 	mutex_enter(&rpcib.rpcib_mutex);
700 	rpcib.rpcib_dip = NULL;
701 	mutex_exit(&rpcib.rpcib_mutex);
702 	mutex_destroy(&rpcib.rpcib_mutex);
703 	return (DDI_SUCCESS);
704 }
705 
706 
707 static void rib_rbufpool_free(rib_hca_t *, int);
708 static void rib_rbufpool_deregister(rib_hca_t *, int);
709 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
710 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
711 static rdma_stat rib_rem_replylist(rib_qp_t *);
712 static int rib_remreply(rib_qp_t *, struct reply *);
713 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
714 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
715 
716 
717 /*
718  * One CQ pair per HCA
719  */
720 static rdma_stat
721 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
722 	rib_cq_t **cqp)
723 {
724 	rib_cq_t	*cq;
725 	ibt_cq_attr_t	cq_attr;
726 	uint32_t	real_size;
727 	ibt_status_t	status;
728 	rdma_stat	error = RDMA_SUCCESS;
729 
730 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
731 	cq->rib_hca = hca;
732 	bzero(&cq_attr, sizeof (cq_attr));
733 	cq_attr.cq_size = cq_size;
734 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
735 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
736 	    &real_size);
737 	if (status != IBT_SUCCESS) {
738 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
739 		    " status=%d", status);
740 		error = RDMA_FAILED;
741 		goto fail;
742 	}
743 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
744 
745 	/*
746 	 * Enable CQ callbacks. CQ Callbacks are single shot
747 	 * (e.g. you have to call ibt_enable_cq_notify()
748 	 * after each callback to get another one).
749 	 */
750 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
751 	if (status != IBT_SUCCESS) {
752 		cmn_err(CE_WARN, "rib_create_cq: "
753 		    "enable_cq_notify failed, status %d", status);
754 		error = RDMA_FAILED;
755 		goto fail;
756 	}
757 	*cqp = cq;
758 
759 	return (error);
760 fail:
761 	if (cq->rib_cq_hdl)
762 		(void) ibt_free_cq(cq->rib_cq_hdl);
763 	if (cq)
764 		kmem_free(cq, sizeof (rib_cq_t));
765 	return (error);
766 }
767 
768 /*
769  * rpcib_find_hca
770  *
771  * Caller should have already locked the hcas_lock before calling
772  * this function.
773  */
774 static rib_hca_t *
775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
776 {
777 	rib_hca_t *hca = ribstat->hcas_list;
778 
779 	while (hca && hca->hca_guid != guid)
780 		hca = hca->next;
781 
782 	return (hca);
783 }
784 
785 static rdma_stat
786 rpcib_open_hcas(rpcib_state_t *ribstat)
787 {
788 	rib_hca_t		*hca;
789 	ibt_status_t		ibt_status;
790 	rdma_stat		status;
791 	ibt_hca_portinfo_t	*pinfop;
792 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
793 	uint_t			size, cq_size;
794 	int			i;
795 	kstat_t *ksp;
796 	cache_avl_struct_t example_avl_node;
797 	char rssc_name[32];
798 	int old_nhca_inited = ribstat->nhca_inited;
799 	ib_guid_t		*hca_guids;
800 
801 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
802 
803 	ribstat->hca_count = ibt_get_hca_list(&hca_guids);
804 	if (ribstat->hca_count == 0)
805 		return (RDMA_FAILED);
806 
807 	rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
808 	/*
809 	 * Open a hca and setup for RDMA
810 	 */
811 	for (i = 0; i < ribstat->hca_count; i++) {
812 		if (rpcib_find_hca(ribstat, hca_guids[i]))
813 			continue;
814 		hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
815 
816 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
817 		    hca_guids[i], &hca->hca_hdl);
818 		if (ibt_status != IBT_SUCCESS) {
819 			kmem_free(hca, sizeof (rib_hca_t));
820 			continue;
821 		}
822 		hca->hca_guid = hca_guids[i];
823 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
824 		hca->state = HCA_INITED;
825 
826 		/*
827 		 * query HCA info
828 		 */
829 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
830 		if (ibt_status != IBT_SUCCESS) {
831 			goto fail1;
832 		}
833 
834 		/*
835 		 * One PD (Protection Domain) per HCA.
836 		 * A qp is allowed to access a memory region
837 		 * only when it's in the same PD as that of
838 		 * the memory region.
839 		 */
840 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
841 		if (ibt_status != IBT_SUCCESS) {
842 			goto fail1;
843 		}
844 
845 		/*
846 		 * query HCA ports
847 		 */
848 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
849 		    0, &pinfop, &hca->hca_nports, &size);
850 		if (ibt_status != IBT_SUCCESS) {
851 			goto fail2;
852 		}
853 		hca->hca_ports = pinfop;
854 		hca->hca_pinfosz = size;
855 		pinfop = NULL;
856 
857 		cq_size = DEF_CQ_SIZE; /* default cq size */
858 		/*
859 		 * Create 2 pairs of cq's (1 pair for client
860 		 * and the other pair for server) on this hca.
861 		 * If number of qp's gets too large, then several
862 		 * cq's will be needed.
863 		 */
864 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
865 		    &hca->svc_rcq);
866 		if (status != RDMA_SUCCESS) {
867 			goto fail3;
868 		}
869 
870 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
871 		    &hca->svc_scq);
872 		if (status != RDMA_SUCCESS) {
873 			goto fail3;
874 		}
875 
876 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
877 		    &hca->clnt_rcq);
878 		if (status != RDMA_SUCCESS) {
879 			goto fail3;
880 		}
881 
882 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
883 		    &hca->clnt_scq);
884 		if (status != RDMA_SUCCESS) {
885 			goto fail3;
886 		}
887 
888 		/*
889 		 * Create buffer pools.
890 		 * Note rib_rbuf_create also allocates memory windows.
891 		 */
892 		hca->recv_pool = rib_rbufpool_create(hca,
893 		    RECV_BUFFER, rib_max_rbufs);
894 		if (hca->recv_pool == NULL) {
895 			goto fail3;
896 		}
897 
898 		hca->send_pool = rib_rbufpool_create(hca,
899 		    SEND_BUFFER, rib_max_rbufs);
900 		if (hca->send_pool == NULL) {
901 			rib_rbufpool_destroy(hca, RECV_BUFFER);
902 			goto fail3;
903 		}
904 
905 		if (hca->server_side_cache == NULL) {
906 			(void) sprintf(rssc_name,
907 			    "rib_srvr_cache_%llx",
908 			    (long long unsigned int) hca->hca_guid);
909 			hca->server_side_cache = kmem_cache_create(
910 			    rssc_name,
911 			    sizeof (cache_avl_struct_t), 0,
912 			    NULL,
913 			    NULL,
914 			    rib_server_side_cache_reclaim,
915 			    hca, NULL, 0);
916 		}
917 
918 		avl_create(&hca->avl_tree,
919 		    avl_compare,
920 		    sizeof (cache_avl_struct_t),
921 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
922 		    (uint_t)(uintptr_t)&example_avl_node);
923 
924 		rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
925 		    hca->iblock);
926 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
927 		rw_init(&hca->avl_rw_lock,
928 		    NULL, RW_DRIVER, hca->iblock);
929 		mutex_init(&hca->cache_allocation_lock,
930 		    NULL, MUTEX_DRIVER, NULL);
931 		hca->avl_init = TRUE;
932 
933 		/* Create kstats for the cache */
934 		ASSERT(INGLOBALZONE(curproc));
935 
936 		if (!stats_enabled) {
937 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
938 			    KSTAT_TYPE_NAMED,
939 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
940 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
941 			    GLOBAL_ZONEID);
942 			if (ksp) {
943 				ksp->ks_data = (void *) &rpcib_kstat;
944 				ksp->ks_update = rpcib_cache_kstat_update;
945 				kstat_install(ksp);
946 				stats_enabled = TRUE;
947 			}
948 		}
949 		if (hca->cleanup_helper == NULL) {
950 			char tq_name[sizeof (hca->hca_guid) * 2 + 1];
951 
952 			(void) snprintf(tq_name, sizeof (tq_name), "%llX",
953 			    (unsigned long long int) hca->hca_guid);
954 			hca->cleanup_helper = ddi_taskq_create(NULL,
955 			    tq_name, 1, TASKQ_DEFAULTPRI, 0);
956 		}
957 
958 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
959 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
960 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
961 		    hca->iblock);
962 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
963 		    hca->iblock);
964 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
965 		hca->inuse = TRUE;
966 
967 		hca->next = ribstat->hcas_list;
968 		ribstat->hcas_list = hca;
969 		ribstat->nhca_inited++;
970 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
971 		continue;
972 
973 fail3:
974 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
975 fail2:
976 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
977 fail1:
978 		(void) ibt_close_hca(hca->hca_hdl);
979 		kmem_free(hca, sizeof (rib_hca_t));
980 	}
981 	rw_exit(&ribstat->hcas_list_lock);
982 	ibt_free_hca_list(hca_guids, ribstat->hca_count);
983 	rib_mod.rdma_count = rib_stat->nhca_inited;
984 
985 	/*
986 	 * return success if at least one new hca has been configured.
987 	 */
988 	if (ribstat->nhca_inited != old_nhca_inited)
989 		return (RDMA_SUCCESS);
990 	else
991 		return (RDMA_FAILED);
992 }
993 
994 /*
995  * Callback routines
996  */
997 
998 /*
999  * SCQ handlers
1000  */
1001 /* ARGSUSED */
1002 static void
1003 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1004 {
1005 	ibt_status_t	ibt_status;
1006 	ibt_wc_t	wc;
1007 	struct send_wid	*wd;
1008 	CONN		*conn;
1009 	rib_qp_t	*qp;
1010 	int		i;
1011 
1012 	/*
1013 	 * Re-enable cq notify here to avoid missing any
1014 	 * completion queue notification.
1015 	 */
1016 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1017 
1018 	ibt_status = IBT_SUCCESS;
1019 	while (ibt_status != IBT_CQ_EMPTY) {
1020 		bzero(&wc, sizeof (wc));
1021 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1022 		if (ibt_status != IBT_SUCCESS)
1023 			return;
1024 
1025 		/*
1026 		 * Got a send completion
1027 		 */
1028 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1029 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1030 			qp = wd->qp;
1031 			conn = qptoc(qp);
1032 
1033 			mutex_enter(&wd->sendwait_lock);
1034 			switch (wc.wc_status) {
1035 			case IBT_WC_SUCCESS:
1036 				wd->status = RDMA_SUCCESS;
1037 				break;
1038 			default:
1039 /*
1040  *    RC Send Q Error Code		Local state     Remote State
1041  *    ==================== 		===========     ============
1042  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
1043  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
1044  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
1045  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
1046  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
1047  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
1048  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
1049  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
1050  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
1051  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
1052  *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
1053  */
1054 				/*
1055 				 * Channel in error state. Set connection to
1056 				 * ERROR and cleanup will happen either from
1057 				 * conn_release  or from rib_conn_get
1058 				 */
1059 				wd->status = RDMA_FAILED;
1060 				mutex_enter(&conn->c_lock);
1061 				if (conn->c_state != C_DISCONN_PEND)
1062 					conn->c_state = C_ERROR_CONN;
1063 				mutex_exit(&conn->c_lock);
1064 				break;
1065 			}
1066 
1067 			if (wd->cv_sig == 1) {
1068 				/*
1069 				 * Notify poster
1070 				 */
1071 				cv_signal(&wd->wait_cv);
1072 				mutex_exit(&wd->sendwait_lock);
1073 			} else {
1074 				/*
1075 				 * Poster not waiting for notification.
1076 				 * Free the send buffers and send_wid
1077 				 */
1078 				for (i = 0; i < wd->nsbufs; i++) {
1079 					rib_rbuf_free(qptoc(wd->qp),
1080 					    SEND_BUFFER,
1081 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1082 				}
1083 
1084 				/* decrement the send ref count */
1085 				rib_send_rele(qp);
1086 
1087 				mutex_exit(&wd->sendwait_lock);
1088 				(void) rib_free_sendwait(wd);
1089 			}
1090 		}
1091 	}
1092 }
1093 
1094 /* ARGSUSED */
1095 static void
1096 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1097 {
1098 	ibt_status_t	ibt_status;
1099 	ibt_wc_t	wc;
1100 	struct send_wid	*wd;
1101 	rib_qp_t	*qp;
1102 	CONN		*conn;
1103 	int		i;
1104 
1105 	/*
1106 	 * Re-enable cq notify here to avoid missing any
1107 	 * completion queue notification.
1108 	 */
1109 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1110 
1111 	ibt_status = IBT_SUCCESS;
1112 	while (ibt_status != IBT_CQ_EMPTY) {
1113 		bzero(&wc, sizeof (wc));
1114 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1115 		if (ibt_status != IBT_SUCCESS)
1116 			return;
1117 
1118 		/*
1119 		 * Got a send completion
1120 		 */
1121 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1122 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1123 			qp = wd->qp;
1124 			conn = qptoc(qp);
1125 			mutex_enter(&wd->sendwait_lock);
1126 
1127 			switch (wc.wc_status) {
1128 			case IBT_WC_SUCCESS:
1129 				wd->status = RDMA_SUCCESS;
1130 				break;
1131 			default:
1132 				/*
1133 				 * Channel in error state. Set connection to
1134 				 * ERROR and cleanup will happen either from
1135 				 * conn_release  or conn timeout.
1136 				 */
1137 				wd->status = RDMA_FAILED;
1138 				mutex_enter(&conn->c_lock);
1139 				if (conn->c_state != C_DISCONN_PEND)
1140 					conn->c_state = C_ERROR_CONN;
1141 				mutex_exit(&conn->c_lock);
1142 				break;
1143 			}
1144 
1145 			if (wd->cv_sig == 1) {
1146 				/*
1147 				 * Update completion status and notify poster
1148 				 */
1149 				cv_signal(&wd->wait_cv);
1150 				mutex_exit(&wd->sendwait_lock);
1151 			} else {
1152 				/*
1153 				 * Poster not waiting for notification.
1154 				 * Free the send buffers and send_wid
1155 				 */
1156 				for (i = 0; i < wd->nsbufs; i++) {
1157 					rib_rbuf_free(qptoc(wd->qp),
1158 					    SEND_BUFFER,
1159 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1160 				}
1161 
1162 				/* decrement the send ref count */
1163 				rib_send_rele(qp);
1164 
1165 				mutex_exit(&wd->sendwait_lock);
1166 				(void) rib_free_sendwait(wd);
1167 			}
1168 		}
1169 	}
1170 }
1171 
1172 /*
1173  * RCQ handler
1174  */
1175 /* ARGSUSED */
1176 static void
1177 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1178 {
1179 	rib_qp_t	*qp;
1180 	ibt_status_t	ibt_status;
1181 	ibt_wc_t	wc;
1182 	struct recv_wid	*rwid;
1183 
1184 	/*
1185 	 * Re-enable cq notify here to avoid missing any
1186 	 * completion queue notification.
1187 	 */
1188 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1189 
1190 	ibt_status = IBT_SUCCESS;
1191 	while (ibt_status != IBT_CQ_EMPTY) {
1192 		bzero(&wc, sizeof (wc));
1193 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1194 		if (ibt_status != IBT_SUCCESS)
1195 			return;
1196 
1197 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1198 		qp = rwid->qp;
1199 
1200 		if (wc.wc_status == IBT_WC_SUCCESS) {
1201 			XDR	inxdrs, *xdrs;
1202 			uint_t	xid, vers, op, find_xid = 0;
1203 			struct reply	*r;
1204 			CONN *conn = qptoc(qp);
1205 			uint32_t rdma_credit = 0;
1206 
1207 			xdrs = &inxdrs;
1208 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1209 			    wc.wc_bytes_xfer, XDR_DECODE);
1210 			/*
1211 			 * Treat xid as opaque (xid is the first entity
1212 			 * in the rpc rdma message).
1213 			 */
1214 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1215 
1216 			/* Skip xid and set the xdr position accordingly. */
1217 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1218 			(void) xdr_u_int(xdrs, &vers);
1219 			(void) xdr_u_int(xdrs, &rdma_credit);
1220 			(void) xdr_u_int(xdrs, &op);
1221 			XDR_DESTROY(xdrs);
1222 
1223 			if (vers != RPCRDMA_VERS) {
1224 				/*
1225 				 * Invalid RPC/RDMA version. Cannot
1226 				 * interoperate.  Set connection to
1227 				 * ERROR state and bail out.
1228 				 */
1229 				mutex_enter(&conn->c_lock);
1230 				if (conn->c_state != C_DISCONN_PEND)
1231 					conn->c_state = C_ERROR_CONN;
1232 				mutex_exit(&conn->c_lock);
1233 				rib_rbuf_free(conn, RECV_BUFFER,
1234 				    (void *)(uintptr_t)rwid->addr);
1235 				rib_free_wid(rwid);
1236 				rib_recv_rele(qp);
1237 				continue;
1238 			}
1239 
1240 			mutex_enter(&qp->replylist_lock);
1241 			for (r = qp->replylist; r != NULL; r = r->next) {
1242 				if (r->xid == xid) {
1243 					find_xid = 1;
1244 					switch (op) {
1245 					case RDMA_MSG:
1246 					case RDMA_NOMSG:
1247 					case RDMA_MSGP:
1248 						r->status = RDMA_SUCCESS;
1249 						r->vaddr_cq = rwid->addr;
1250 						r->bytes_xfer =
1251 						    wc.wc_bytes_xfer;
1252 						cv_signal(&r->wait_cv);
1253 						break;
1254 					default:
1255 						rib_rbuf_free(qptoc(qp),
1256 						    RECV_BUFFER,
1257 						    (void *)(uintptr_t)
1258 						    rwid->addr);
1259 						break;
1260 					}
1261 					break;
1262 				}
1263 			}
1264 			mutex_exit(&qp->replylist_lock);
1265 			if (find_xid == 0) {
1266 				/* RPC caller not waiting for reply */
1267 
1268 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1269 				    int, xid);
1270 
1271 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1272 				    (void *)(uintptr_t)rwid->addr);
1273 			}
1274 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1275 			CONN *conn = qptoc(qp);
1276 
1277 			/*
1278 			 * Connection being flushed. Just free
1279 			 * the posted buffer
1280 			 */
1281 			rib_rbuf_free(conn, RECV_BUFFER,
1282 			    (void *)(uintptr_t)rwid->addr);
1283 		} else {
1284 			CONN *conn = qptoc(qp);
1285 /*
1286  *  RC Recv Q Error Code		Local state     Remote State
1287  *  ====================		===========     ============
1288  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1289  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1290  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1291  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1292  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1293  *  IBT_WC_WR_FLUSHED_ERR               None            None
1294  */
1295 			/*
1296 			 * Channel in error state. Set connection
1297 			 * in ERROR state.
1298 			 */
1299 			mutex_enter(&conn->c_lock);
1300 			if (conn->c_state != C_DISCONN_PEND)
1301 				conn->c_state = C_ERROR_CONN;
1302 			mutex_exit(&conn->c_lock);
1303 			rib_rbuf_free(conn, RECV_BUFFER,
1304 			    (void *)(uintptr_t)rwid->addr);
1305 		}
1306 		rib_free_wid(rwid);
1307 		rib_recv_rele(qp);
1308 	}
1309 }
1310 
1311 /* Server side */
1312 /* ARGSUSED */
1313 static void
1314 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1315 {
1316 	rdma_recv_data_t *rdp;
1317 	rib_qp_t	*qp;
1318 	ibt_status_t	ibt_status;
1319 	ibt_wc_t	wc;
1320 	struct svc_recv	*s_recvp;
1321 	CONN		*conn;
1322 	mblk_t		*mp;
1323 
1324 	/*
1325 	 * Re-enable cq notify here to avoid missing any
1326 	 * completion queue notification.
1327 	 */
1328 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1329 
1330 	ibt_status = IBT_SUCCESS;
1331 	while (ibt_status != IBT_CQ_EMPTY) {
1332 		bzero(&wc, sizeof (wc));
1333 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1334 		if (ibt_status != IBT_SUCCESS)
1335 			return;
1336 
1337 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1338 		qp = s_recvp->qp;
1339 		conn = qptoc(qp);
1340 
1341 		if (wc.wc_status == IBT_WC_SUCCESS) {
1342 			XDR	inxdrs, *xdrs;
1343 			uint_t	xid, vers, op;
1344 			uint32_t rdma_credit;
1345 
1346 			xdrs = &inxdrs;
1347 			/* s_recvp->vaddr stores data */
1348 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1349 			    wc.wc_bytes_xfer, XDR_DECODE);
1350 
1351 			/*
1352 			 * Treat xid as opaque (xid is the first entity
1353 			 * in the rpc rdma message).
1354 			 */
1355 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1356 			/* Skip xid and set the xdr position accordingly. */
1357 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1358 			if (!xdr_u_int(xdrs, &vers) ||
1359 			    !xdr_u_int(xdrs, &rdma_credit) ||
1360 			    !xdr_u_int(xdrs, &op)) {
1361 				rib_rbuf_free(conn, RECV_BUFFER,
1362 				    (void *)(uintptr_t)s_recvp->vaddr);
1363 				XDR_DESTROY(xdrs);
1364 				rib_recv_rele(qp);
1365 				(void) rib_free_svc_recv(s_recvp);
1366 				continue;
1367 			}
1368 			XDR_DESTROY(xdrs);
1369 
1370 			if (vers != RPCRDMA_VERS) {
1371 				/*
1372 				 * Invalid RPC/RDMA version.
1373 				 * Drop rpc rdma message.
1374 				 */
1375 				rib_rbuf_free(conn, RECV_BUFFER,
1376 				    (void *)(uintptr_t)s_recvp->vaddr);
1377 				rib_recv_rele(qp);
1378 				(void) rib_free_svc_recv(s_recvp);
1379 				continue;
1380 			}
1381 			/*
1382 			 * Is this for RDMA_DONE?
1383 			 */
1384 			if (op == RDMA_DONE) {
1385 				rib_rbuf_free(conn, RECV_BUFFER,
1386 				    (void *)(uintptr_t)s_recvp->vaddr);
1387 				/*
1388 				 * Wake up the thread waiting on
1389 				 * a RDMA_DONE for xid
1390 				 */
1391 				mutex_enter(&qp->rdlist_lock);
1392 				rdma_done_notify(qp, xid);
1393 				mutex_exit(&qp->rdlist_lock);
1394 				rib_recv_rele(qp);
1395 				(void) rib_free_svc_recv(s_recvp);
1396 				continue;
1397 			}
1398 
1399 			mutex_enter(&plugin_state_lock);
1400 			mutex_enter(&conn->c_lock);
1401 			if ((plugin_state == ACCEPT) &&
1402 			    (conn->c_state == C_CONNECTED)) {
1403 				conn->c_ref++;
1404 				mutex_exit(&conn->c_lock);
1405 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1406 				    == NULL)
1407 					(void) strwaitbuf(
1408 					    sizeof (*rdp), BPRI_LO);
1409 				/*
1410 				 * Plugin is in accept state, hence the master
1411 				 * transport queue for this is still accepting
1412 				 * requests. Hence we can call svc_queuereq to
1413 				 * queue this recieved msg.
1414 				 */
1415 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1416 				rdp->conn = conn;
1417 				rdp->rpcmsg.addr =
1418 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1419 				rdp->rpcmsg.type = RECV_BUFFER;
1420 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1421 				rdp->status = wc.wc_status;
1422 				mp->b_wptr += sizeof (*rdp);
1423 				svc_queuereq((queue_t *)rib_stat->q, mp);
1424 				mutex_exit(&plugin_state_lock);
1425 			} else {
1426 				/*
1427 				 * The master transport for this is going
1428 				 * away and the queue is not accepting anymore
1429 				 * requests for krpc, so don't do anything, just
1430 				 * free the msg.
1431 				 */
1432 				mutex_exit(&conn->c_lock);
1433 				mutex_exit(&plugin_state_lock);
1434 				rib_rbuf_free(conn, RECV_BUFFER,
1435 				    (void *)(uintptr_t)s_recvp->vaddr);
1436 			}
1437 		} else {
1438 			rib_rbuf_free(conn, RECV_BUFFER,
1439 			    (void *)(uintptr_t)s_recvp->vaddr);
1440 		}
1441 		rib_recv_rele(qp);
1442 		(void) rib_free_svc_recv(s_recvp);
1443 	}
1444 }
1445 
1446 static void
1447 rib_attach_hca()
1448 {
1449 	mutex_enter(&rib_stat->open_hca_lock);
1450 	(void) rpcib_open_hcas(rib_stat);
1451 	rib_listen(NULL);
1452 	mutex_exit(&rib_stat->open_hca_lock);
1453 }
1454 
1455 /*
1456  * Handles DR event of IBT_HCA_DETACH_EVENT.
1457  */
1458 /* ARGSUSED */
1459 static void
1460 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1461 	ibt_async_code_t code, ibt_async_event_t *event)
1462 {
1463 	switch (code) {
1464 	case IBT_HCA_ATTACH_EVENT:
1465 		rib_attach_hca();
1466 		break;
1467 	case IBT_HCA_DETACH_EVENT:
1468 		rib_detach_hca(hca_hdl);
1469 #ifdef DEBUG
1470 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1471 #endif
1472 		break;
1473 	case IBT_EVENT_PORT_UP:
1474 		/*
1475 		 * A port is up. We should call rib_listen() since there is
1476 		 * a chance that rib_listen() may have failed during
1477 		 * rib_attach_hca() because the port had not been up yet.
1478 		 */
1479 		rib_listen(NULL);
1480 #ifdef DEBUG
1481 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1482 #endif
1483 		break;
1484 #ifdef DEBUG
1485 	case IBT_EVENT_PATH_MIGRATED:
1486 		cmn_err(CE_NOTE, "rib_async_handler(): "
1487 		    "IBT_EVENT_PATH_MIGRATED\n");
1488 		break;
1489 	case IBT_EVENT_SQD:
1490 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1491 		break;
1492 	case IBT_EVENT_COM_EST:
1493 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1494 		break;
1495 	case IBT_ERROR_CATASTROPHIC_CHAN:
1496 		cmn_err(CE_NOTE, "rib_async_handler(): "
1497 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1498 		break;
1499 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1500 		cmn_err(CE_NOTE, "rib_async_handler(): "
1501 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1502 		break;
1503 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1504 		cmn_err(CE_NOTE, "rib_async_handler(): "
1505 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1506 		break;
1507 	case IBT_ERROR_PATH_MIGRATE_REQ:
1508 		cmn_err(CE_NOTE, "rib_async_handler(): "
1509 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1510 		break;
1511 	case IBT_ERROR_CQ:
1512 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1513 		break;
1514 	case IBT_ERROR_PORT_DOWN:
1515 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1516 		break;
1517 	case IBT_ASYNC_OPAQUE1:
1518 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1519 		break;
1520 	case IBT_ASYNC_OPAQUE2:
1521 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1522 		break;
1523 	case IBT_ASYNC_OPAQUE3:
1524 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1525 		break;
1526 	case IBT_ASYNC_OPAQUE4:
1527 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1528 		break;
1529 #endif
1530 	default:
1531 		break;
1532 	}
1533 }
1534 
1535 /*
1536  * Client's reachable function.
1537  */
1538 static rdma_stat
1539 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1540 {
1541 	rdma_stat	status;
1542 	rpcib_ping_t	rpt;
1543 	struct netbuf	saddr;
1544 	CONN		*conn;
1545 
1546 	bzero(&saddr, sizeof (struct netbuf));
1547 	status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1548 
1549 	if (status == RDMA_SUCCESS) {
1550 		*handle = (void *)rpt.hca;
1551 		/* release the reference */
1552 		(void) rib_conn_release(conn);
1553 		return (RDMA_SUCCESS);
1554 	} else {
1555 		*handle = NULL;
1556 		DTRACE_PROBE(rpcib__i__pingfailed);
1557 		return (RDMA_FAILED);
1558 	}
1559 }
1560 
1561 /* Client side qp creation */
1562 static rdma_stat
1563 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1564 {
1565 	rib_qp_t	*kqp = NULL;
1566 	CONN		*conn;
1567 	rdma_clnt_cred_ctrl_t *cc_info;
1568 
1569 	ASSERT(qp != NULL);
1570 	*qp = NULL;
1571 
1572 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1573 	conn = qptoc(kqp);
1574 	kqp->hca = hca;
1575 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1576 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1577 
1578 	kqp->mode = RIB_CLIENT;
1579 	kqp->chan_flags = IBT_BLOCKING;
1580 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1581 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1582 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1583 	/*
1584 	 * Initialize
1585 	 */
1586 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1587 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1588 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1589 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1590 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1591 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1592 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1593 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1594 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1595 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1596 	/*
1597 	 * Initialize the client credit control
1598 	 * portion of the rdmaconn struct.
1599 	 */
1600 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1601 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1602 	cc_info->clnt_cc_granted_ops = 0;
1603 	cc_info->clnt_cc_in_flight_ops = 0;
1604 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1605 
1606 	*qp = kqp;
1607 	return (RDMA_SUCCESS);
1608 }
1609 
1610 /* Server side qp creation */
1611 static rdma_stat
1612 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1613 {
1614 	rib_qp_t	*kqp = NULL;
1615 	ibt_chan_sizes_t	chan_sizes;
1616 	ibt_rc_chan_alloc_args_t	qp_attr;
1617 	ibt_status_t		ibt_status;
1618 	rdma_srv_cred_ctrl_t *cc_info;
1619 
1620 	*qp = NULL;
1621 
1622 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1623 	kqp->hca = hca;
1624 	kqp->port_num = port;
1625 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1626 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1627 
1628 	/*
1629 	 * Create the qp handle
1630 	 */
1631 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1632 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1633 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1634 	qp_attr.rc_pd = hca->pd_hdl;
1635 	qp_attr.rc_hca_port_num = port;
1636 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1637 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1638 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1639 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1640 	qp_attr.rc_clone_chan = NULL;
1641 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1642 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1643 
1644 	rw_enter(&hca->state_lock, RW_READER);
1645 	if (hca->state != HCA_DETACHED) {
1646 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1647 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1648 		    &chan_sizes);
1649 	} else {
1650 		rw_exit(&hca->state_lock);
1651 		goto fail;
1652 	}
1653 	rw_exit(&hca->state_lock);
1654 
1655 	if (ibt_status != IBT_SUCCESS) {
1656 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1657 		    int, ibt_status);
1658 		goto fail;
1659 	}
1660 
1661 	kqp->mode = RIB_SERVER;
1662 	kqp->chan_flags = IBT_BLOCKING;
1663 	kqp->q = q;	/* server ONLY */
1664 
1665 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1666 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1667 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1668 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1669 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1670 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1671 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1672 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1673 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1674 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1675 	/*
1676 	 * Set the private data area to qp to be used in callbacks
1677 	 */
1678 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1679 	kqp->rdmaconn.c_state = C_CONNECTED;
1680 
1681 	/*
1682 	 * Initialize the server credit control
1683 	 * portion of the rdmaconn struct.
1684 	 */
1685 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1686 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1687 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1688 	cc_info->srv_cc_cur_buffers_used = 0;
1689 	cc_info->srv_cc_posted = preposted_rbufs;
1690 
1691 	*qp = kqp;
1692 
1693 	return (RDMA_SUCCESS);
1694 fail:
1695 	if (kqp)
1696 		kmem_free(kqp, sizeof (rib_qp_t));
1697 
1698 	return (RDMA_FAILED);
1699 }
1700 
1701 /* ARGSUSED */
1702 ibt_cm_status_t
1703 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1704     ibt_cm_return_args_t *ret_args, void *priv_data,
1705     ibt_priv_data_len_t len)
1706 {
1707 	rib_hca_t	*hca;
1708 
1709 	hca = (rib_hca_t *)clnt_hdl;
1710 
1711 	switch (event->cm_type) {
1712 
1713 	/* got a connection close event */
1714 	case IBT_CM_EVENT_CONN_CLOSED:
1715 	{
1716 		CONN	*conn;
1717 		rib_qp_t *qp;
1718 
1719 		/* check reason why connection was closed */
1720 		switch (event->cm_event.closed) {
1721 		case IBT_CM_CLOSED_DREP_RCVD:
1722 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1723 		case IBT_CM_CLOSED_DUP:
1724 		case IBT_CM_CLOSED_ABORT:
1725 		case IBT_CM_CLOSED_ALREADY:
1726 			/*
1727 			 * These cases indicate the local end initiated
1728 			 * the closing of the channel. Nothing to do here.
1729 			 */
1730 			break;
1731 		default:
1732 			/*
1733 			 * Reason for CONN_CLOSED event must be one of
1734 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1735 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1736 			 * the remote end is closing the channel. In these
1737 			 * cases free the channel and transition to error
1738 			 * state
1739 			 */
1740 			qp = ibt_get_chan_private(event->cm_channel);
1741 			conn = qptoc(qp);
1742 			mutex_enter(&conn->c_lock);
1743 			if (conn->c_state == C_DISCONN_PEND) {
1744 				mutex_exit(&conn->c_lock);
1745 				break;
1746 			}
1747 
1748 			conn->c_state = C_ERROR_CONN;
1749 
1750 			/*
1751 			 * Free the conn if c_ref is down to 0 already
1752 			 */
1753 			if (conn->c_ref == 0) {
1754 				/*
1755 				 * Remove from list and free conn
1756 				 */
1757 				conn->c_state = C_DISCONN_PEND;
1758 				mutex_exit(&conn->c_lock);
1759 				rw_enter(&hca->state_lock, RW_READER);
1760 				if (hca->state != HCA_DETACHED)
1761 					(void) rib_disconnect_channel(conn,
1762 					    &hca->cl_conn_list);
1763 				rw_exit(&hca->state_lock);
1764 			} else {
1765 				/*
1766 				 * conn will be freed when c_ref goes to 0.
1767 				 * Indicate to cleaning thread not to close
1768 				 * the connection, but just free the channel.
1769 				 */
1770 				conn->c_flags |= C_CLOSE_NOTNEEDED;
1771 				mutex_exit(&conn->c_lock);
1772 			}
1773 #ifdef DEBUG
1774 			if (rib_debug)
1775 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1776 				    "(CONN_CLOSED) channel disconnected");
1777 #endif
1778 			break;
1779 		}
1780 		break;
1781 	}
1782 	default:
1783 		break;
1784 	}
1785 	return (IBT_CM_ACCEPT);
1786 }
1787 
1788 /*
1789  * Connect to the server.
1790  */
1791 rdma_stat
1792 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1793 {
1794 	ibt_chan_open_args_t	chan_args;	/* channel args */
1795 	ibt_chan_sizes_t	chan_sizes;
1796 	ibt_rc_chan_alloc_args_t	qp_attr;
1797 	ibt_status_t		ibt_status;
1798 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1799 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1800 	ibt_ip_cm_info_t	ipcm_info;
1801 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1802 
1803 
1804 	(void) bzero(&chan_args, sizeof (chan_args));
1805 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1806 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1807 
1808 	ipcm_info.src_addr.family = rptp->srcip.family;
1809 	switch (ipcm_info.src_addr.family) {
1810 	case AF_INET:
1811 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1812 		break;
1813 	case AF_INET6:
1814 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1815 		break;
1816 	}
1817 
1818 	ipcm_info.dst_addr.family = rptp->srcip.family;
1819 	switch (ipcm_info.dst_addr.family) {
1820 	case AF_INET:
1821 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1822 		break;
1823 	case AF_INET6:
1824 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1825 		break;
1826 	}
1827 
1828 	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1829 
1830 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1831 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1832 
1833 	if (ibt_status != IBT_SUCCESS) {
1834 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1835 		return (-1);
1836 	}
1837 
1838 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1839 	/* Alloc a RC channel */
1840 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1841 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1842 	qp_attr.rc_pd = hca->pd_hdl;
1843 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1844 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1845 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1846 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1847 	qp_attr.rc_clone_chan = NULL;
1848 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1849 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1850 
1851 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1852 	chan_args.oc_path = &rptp->path;
1853 
1854 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1855 	chan_args.oc_cm_clnt_private = (void *)hca;
1856 	chan_args.oc_rdma_ra_out = 4;
1857 	chan_args.oc_rdma_ra_in = 4;
1858 	chan_args.oc_path_retry_cnt = 2;
1859 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1860 	chan_args.oc_priv_data = cmp_ip_pvt;
1861 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1862 
1863 refresh:
1864 	rw_enter(&hca->state_lock, RW_READER);
1865 	if (hca->state != HCA_DETACHED) {
1866 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1867 		    IBT_ACHAN_NO_FLAGS,
1868 		    &qp_attr, &qp->qp_hdl,
1869 		    &chan_sizes);
1870 	} else {
1871 		rw_exit(&hca->state_lock);
1872 		return (RDMA_FAILED);
1873 	}
1874 	rw_exit(&hca->state_lock);
1875 
1876 	if (ibt_status != IBT_SUCCESS) {
1877 		DTRACE_PROBE1(rpcib__i_conntosrv,
1878 		    int, ibt_status);
1879 		return (RDMA_FAILED);
1880 	}
1881 
1882 	/* Connect to the Server */
1883 	(void) bzero(&ret_args, sizeof (ret_args));
1884 	mutex_enter(&qp->cb_lock);
1885 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1886 	    IBT_BLOCKING, &chan_args, &ret_args);
1887 	if (ibt_status != IBT_SUCCESS) {
1888 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1889 		    int, ibt_status, int, ret_args.rc_status);
1890 
1891 		(void) ibt_free_channel(qp->qp_hdl);
1892 		qp->qp_hdl = NULL;
1893 		mutex_exit(&qp->cb_lock);
1894 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1895 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1896 			/*
1897 			 * Got IBT_CM_CONN_STALE probably because of stale
1898 			 * data on the passive end of a channel that existed
1899 			 * prior to reboot. Retry establishing a channel
1900 			 * REFRESH_ATTEMPTS times, during which time the
1901 			 * stale conditions on the server might clear up.
1902 			 */
1903 			goto refresh;
1904 		}
1905 		return (RDMA_FAILED);
1906 	}
1907 	mutex_exit(&qp->cb_lock);
1908 	/*
1909 	 * Set the private data area to qp to be used in callbacks
1910 	 */
1911 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1912 	return (RDMA_SUCCESS);
1913 }
1914 
1915 rdma_stat
1916 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1917 {
1918 	uint_t			i, addr_count;
1919 	ibt_status_t		ibt_status;
1920 	uint8_t			num_paths_p;
1921 	ibt_ip_path_attr_t	ipattr;
1922 	ibt_path_ip_src_t	srcip;
1923 	rpcib_ipaddrs_t		addrs4;
1924 	rpcib_ipaddrs_t		addrs6;
1925 	struct sockaddr_in	*sinp;
1926 	struct sockaddr_in6	*sin6p;
1927 	rdma_stat		retval = RDMA_FAILED;
1928 	rib_hca_t *hca;
1929 
1930 	if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1931 		return (RDMA_INVAL);
1932 	ASSERT(raddr->buf != NULL);
1933 
1934 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1935 
1936 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1937 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1938 		retval = RDMA_FAILED;
1939 		goto done2;
1940 	}
1941 
1942 	if (addr_type == AF_INET) {
1943 		addr_count = addrs4.ri_count;
1944 		sinp = (struct sockaddr_in *)raddr->buf;
1945 		rptp->dstip.family = AF_INET;
1946 		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1947 		sinp = addrs4.ri_list;
1948 	} else {
1949 		addr_count = addrs6.ri_count;
1950 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1951 		rptp->dstip.family = AF_INET6;
1952 		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1953 		sin6p = addrs6.ri_list;
1954 	}
1955 
1956 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1957 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1958 		rw_enter(&hca->state_lock, RW_READER);
1959 		if (hca->state == HCA_DETACHED) {
1960 			rw_exit(&hca->state_lock);
1961 			continue;
1962 		}
1963 
1964 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1965 		ipattr.ipa_hca_guid	= hca->hca_guid;
1966 		ipattr.ipa_ndst		= 1;
1967 		ipattr.ipa_max_paths	= 1;
1968 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1969 		for (i = 0; i < addr_count; i++) {
1970 			num_paths_p = 0;
1971 			if (addr_type == AF_INET) {
1972 				ipattr.ipa_src_ip.un.ip4addr =
1973 				    sinp[i].sin_addr.s_addr;
1974 			} else {
1975 				ipattr.ipa_src_ip.un.ip6addr =
1976 				    sin6p[i].sin6_addr;
1977 			}
1978 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1979 
1980 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1981 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1982 			    &num_paths_p, &srcip);
1983 			if (ibt_status == IBT_SUCCESS &&
1984 			    num_paths_p != 0 &&
1985 			    rptp->path.pi_hca_guid == hca->hca_guid) {
1986 				rptp->hca = hca;
1987 				rw_exit(&hca->state_lock);
1988 				if (addr_type == AF_INET) {
1989 					rptp->srcip.family = AF_INET;
1990 					rptp->srcip.un.ip4addr =
1991 					    srcip.ip_primary.un.ip4addr;
1992 				} else {
1993 					rptp->srcip.family = AF_INET6;
1994 					rptp->srcip.un.ip6addr =
1995 					    srcip.ip_primary.un.ip6addr;
1996 
1997 				}
1998 				retval = RDMA_SUCCESS;
1999 				goto done1;
2000 			}
2001 		}
2002 		rw_exit(&hca->state_lock);
2003 	}
2004 done1:
2005 	rw_exit(&rib_stat->hcas_list_lock);
2006 done2:
2007 	if (addrs4.ri_size > 0)
2008 		kmem_free(addrs4.ri_list, addrs4.ri_size);
2009 	if (addrs6.ri_size > 0)
2010 		kmem_free(addrs6.ri_list, addrs6.ri_size);
2011 	return (retval);
2012 }
2013 
2014 /*
2015  * Close channel, remove from connection list and
2016  * free up resources allocated for that channel.
2017  */
2018 rdma_stat
2019 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2020 {
2021 	rib_qp_t	*qp = ctoqp(conn);
2022 	rib_hca_t	*hca;
2023 
2024 	mutex_enter(&conn->c_lock);
2025 	if (conn->c_timeout != NULL) {
2026 		mutex_exit(&conn->c_lock);
2027 		(void) untimeout(conn->c_timeout);
2028 		mutex_enter(&conn->c_lock);
2029 	}
2030 
2031 	while (conn->c_flags & C_CLOSE_PENDING) {
2032 		cv_wait(&conn->c_cv, &conn->c_lock);
2033 	}
2034 	mutex_exit(&conn->c_lock);
2035 
2036 	/*
2037 	 * c_ref == 0 and connection is in C_DISCONN_PEND
2038 	 */
2039 	hca = qp->hca;
2040 	if (conn_list != NULL)
2041 		(void) rib_rm_conn(conn, conn_list);
2042 
2043 	/*
2044 	 * There is only one case where we get here with
2045 	 * qp_hdl = NULL, which is during connection setup on
2046 	 * the client. In such a case there are no posted
2047 	 * send/recv buffers.
2048 	 */
2049 	if (qp->qp_hdl != NULL) {
2050 		mutex_enter(&qp->posted_rbufs_lock);
2051 		while (qp->n_posted_rbufs)
2052 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2053 		mutex_exit(&qp->posted_rbufs_lock);
2054 
2055 		mutex_enter(&qp->send_rbufs_lock);
2056 		while (qp->n_send_rbufs)
2057 			cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2058 			mutex_exit(&qp->send_rbufs_lock);
2059 
2060 		(void) ibt_free_channel(qp->qp_hdl);
2061 			qp->qp_hdl = NULL;
2062 	}
2063 
2064 	ASSERT(qp->rdlist == NULL);
2065 
2066 	if (qp->replylist != NULL) {
2067 		(void) rib_rem_replylist(qp);
2068 	}
2069 
2070 	cv_destroy(&qp->cb_conn_cv);
2071 	cv_destroy(&qp->posted_rbufs_cv);
2072 	cv_destroy(&qp->send_rbufs_cv);
2073 	mutex_destroy(&qp->cb_lock);
2074 	mutex_destroy(&qp->replylist_lock);
2075 	mutex_destroy(&qp->posted_rbufs_lock);
2076 	mutex_destroy(&qp->send_rbufs_lock);
2077 	mutex_destroy(&qp->rdlist_lock);
2078 
2079 	cv_destroy(&conn->c_cv);
2080 	mutex_destroy(&conn->c_lock);
2081 
2082 	if (conn->c_raddr.buf != NULL) {
2083 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2084 	}
2085 	if (conn->c_laddr.buf != NULL) {
2086 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2087 	}
2088 	if (conn->c_netid != NULL) {
2089 		kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2090 	}
2091 	if (conn->c_addrmask.buf != NULL) {
2092 		kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len);
2093 	}
2094 
2095 	/*
2096 	 * Credit control cleanup.
2097 	 */
2098 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2099 		rdma_clnt_cred_ctrl_t *cc_info;
2100 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2101 		cv_destroy(&cc_info->clnt_cc_cv);
2102 	}
2103 
2104 	kmem_free(qp, sizeof (rib_qp_t));
2105 
2106 	/*
2107 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2108 	 * then the hca is no longer being used.
2109 	 */
2110 	if (conn_list != NULL) {
2111 		rw_enter(&hca->state_lock, RW_READER);
2112 		if (hca->state == HCA_DETACHED) {
2113 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2114 			if (hca->srv_conn_list.conn_hd == NULL) {
2115 				rw_enter(&hca->cl_conn_list.conn_lock,
2116 				    RW_READER);
2117 
2118 				if (hca->cl_conn_list.conn_hd == NULL) {
2119 					mutex_enter(&hca->inuse_lock);
2120 					hca->inuse = FALSE;
2121 					cv_signal(&hca->cb_cv);
2122 					mutex_exit(&hca->inuse_lock);
2123 				}
2124 				rw_exit(&hca->cl_conn_list.conn_lock);
2125 			}
2126 			rw_exit(&hca->srv_conn_list.conn_lock);
2127 		}
2128 		rw_exit(&hca->state_lock);
2129 	}
2130 
2131 	return (RDMA_SUCCESS);
2132 }
2133 
2134 /*
2135  * All sends are done under the protection of
2136  * the wdesc->sendwait_lock. n_send_rbufs count
2137  * is protected using the send_rbufs_lock.
2138  * lock ordering is:
2139  * sendwait_lock -> send_rbufs_lock
2140  */
2141 
2142 void
2143 rib_send_hold(rib_qp_t *qp)
2144 {
2145 	mutex_enter(&qp->send_rbufs_lock);
2146 	qp->n_send_rbufs++;
2147 	mutex_exit(&qp->send_rbufs_lock);
2148 }
2149 
2150 void
2151 rib_send_rele(rib_qp_t *qp)
2152 {
2153 	mutex_enter(&qp->send_rbufs_lock);
2154 	qp->n_send_rbufs--;
2155 	if (qp->n_send_rbufs == 0)
2156 		cv_signal(&qp->send_rbufs_cv);
2157 	mutex_exit(&qp->send_rbufs_lock);
2158 }
2159 
2160 void
2161 rib_recv_rele(rib_qp_t *qp)
2162 {
2163 	mutex_enter(&qp->posted_rbufs_lock);
2164 	qp->n_posted_rbufs--;
2165 	if (qp->n_posted_rbufs == 0)
2166 		cv_signal(&qp->posted_rbufs_cv);
2167 	mutex_exit(&qp->posted_rbufs_lock);
2168 }
2169 
2170 /*
2171  * Wait for send completion notification. Only on receiving a
2172  * notification be it a successful or error completion, free the
2173  * send_wid.
2174  */
2175 static rdma_stat
2176 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2177 {
2178 	clock_t timout, cv_wait_ret;
2179 	rdma_stat error = RDMA_SUCCESS;
2180 	int	i;
2181 
2182 	/*
2183 	 * Wait for send to complete
2184 	 */
2185 	ASSERT(wd != NULL);
2186 	mutex_enter(&wd->sendwait_lock);
2187 	if (wd->status == (uint_t)SEND_WAIT) {
2188 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2189 		    ddi_get_lbolt();
2190 
2191 		if (qp->mode == RIB_SERVER) {
2192 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2193 			    &wd->sendwait_lock, timout)) > 0 &&
2194 			    wd->status == (uint_t)SEND_WAIT)
2195 				;
2196 			switch (cv_wait_ret) {
2197 			case -1:	/* timeout */
2198 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2199 
2200 				wd->cv_sig = 0;		/* no signal needed */
2201 				error = RDMA_TIMEDOUT;
2202 				break;
2203 			default:	/* got send completion */
2204 				break;
2205 			}
2206 		} else {
2207 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2208 			    &wd->sendwait_lock, timout)) > 0 &&
2209 			    wd->status == (uint_t)SEND_WAIT)
2210 				;
2211 			switch (cv_wait_ret) {
2212 			case -1:	/* timeout */
2213 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2214 
2215 				wd->cv_sig = 0;		/* no signal needed */
2216 				error = RDMA_TIMEDOUT;
2217 				break;
2218 			case 0:		/* interrupted */
2219 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2220 
2221 				wd->cv_sig = 0;		/* no signal needed */
2222 				error = RDMA_INTR;
2223 				break;
2224 			default:	/* got send completion */
2225 				break;
2226 			}
2227 		}
2228 	}
2229 
2230 	if (wd->status != (uint_t)SEND_WAIT) {
2231 		/* got send completion */
2232 		if (wd->status != RDMA_SUCCESS) {
2233 			switch (wd->status) {
2234 			case RDMA_CONNLOST:
2235 				error = RDMA_CONNLOST;
2236 				break;
2237 			default:
2238 				error = RDMA_FAILED;
2239 				break;
2240 			}
2241 		}
2242 		for (i = 0; i < wd->nsbufs; i++) {
2243 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2244 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2245 		}
2246 
2247 		rib_send_rele(qp);
2248 
2249 		mutex_exit(&wd->sendwait_lock);
2250 		(void) rib_free_sendwait(wd);
2251 
2252 	} else {
2253 		mutex_exit(&wd->sendwait_lock);
2254 	}
2255 	return (error);
2256 }
2257 
2258 static struct send_wid *
2259 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2260 {
2261 	struct send_wid	*wd;
2262 
2263 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2264 	wd->xid = xid;
2265 	wd->cv_sig = cv_sig;
2266 	wd->qp = qp;
2267 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2268 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2269 	wd->status = (uint_t)SEND_WAIT;
2270 
2271 	return (wd);
2272 }
2273 
2274 static int
2275 rib_free_sendwait(struct send_wid *wdesc)
2276 {
2277 	cv_destroy(&wdesc->wait_cv);
2278 	mutex_destroy(&wdesc->sendwait_lock);
2279 	kmem_free(wdesc, sizeof (*wdesc));
2280 
2281 	return (0);
2282 }
2283 
2284 static rdma_stat
2285 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2286 {
2287 	mutex_enter(&qp->replylist_lock);
2288 	if (rep != NULL) {
2289 		(void) rib_remreply(qp, rep);
2290 		mutex_exit(&qp->replylist_lock);
2291 		return (RDMA_SUCCESS);
2292 	}
2293 	mutex_exit(&qp->replylist_lock);
2294 	return (RDMA_FAILED);
2295 }
2296 
2297 /*
2298  * Send buffers are freed here only in case of error in posting
2299  * on QP. If the post succeeded, the send buffers are freed upon
2300  * send completion in rib_sendwait() or in the scq_handler.
2301  */
2302 rdma_stat
2303 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2304 	int send_sig, int cv_sig, caddr_t *swid)
2305 {
2306 	struct send_wid	*wdesc;
2307 	struct clist	*clp;
2308 	ibt_status_t	ibt_status = IBT_SUCCESS;
2309 	rdma_stat	ret = RDMA_SUCCESS;
2310 	ibt_send_wr_t	tx_wr;
2311 	int		i, nds;
2312 	ibt_wr_ds_t	sgl[DSEG_MAX];
2313 	uint_t		total_msg_size;
2314 	rib_qp_t	*qp;
2315 
2316 	qp = ctoqp(conn);
2317 
2318 	ASSERT(cl != NULL);
2319 
2320 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2321 
2322 	nds = 0;
2323 	total_msg_size = 0;
2324 	clp = cl;
2325 	while (clp != NULL) {
2326 		if (nds >= DSEG_MAX) {
2327 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2328 			return (RDMA_FAILED);
2329 		}
2330 		sgl[nds].ds_va = clp->w.c_saddr;
2331 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2332 		sgl[nds].ds_len = clp->c_len;
2333 		total_msg_size += clp->c_len;
2334 		clp = clp->c_next;
2335 		nds++;
2336 	}
2337 
2338 	if (send_sig) {
2339 		/* Set SEND_SIGNAL flag. */
2340 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2341 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2342 		*swid = (caddr_t)wdesc;
2343 		tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2344 		mutex_enter(&wdesc->sendwait_lock);
2345 		wdesc->nsbufs = nds;
2346 		for (i = 0; i < nds; i++) {
2347 			wdesc->sbufaddr[i] = sgl[i].ds_va;
2348 		}
2349 	} else {
2350 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2351 		*swid = NULL;
2352 		tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2353 	}
2354 
2355 	tx_wr.wr_opcode = IBT_WRC_SEND;
2356 	tx_wr.wr_trans = IBT_RC_SRV;
2357 	tx_wr.wr_nds = nds;
2358 	tx_wr.wr_sgl = sgl;
2359 
2360 	mutex_enter(&conn->c_lock);
2361 	if (conn->c_state == C_CONNECTED) {
2362 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2363 	}
2364 	if (conn->c_state != C_CONNECTED ||
2365 	    ibt_status != IBT_SUCCESS) {
2366 		if (conn->c_state != C_DISCONN_PEND)
2367 			conn->c_state = C_ERROR_CONN;
2368 		mutex_exit(&conn->c_lock);
2369 		if (send_sig) {
2370 			for (i = 0; i < nds; i++) {
2371 				rib_rbuf_free(conn, SEND_BUFFER,
2372 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2373 			}
2374 			mutex_exit(&wdesc->sendwait_lock);
2375 			(void) rib_free_sendwait(wdesc);
2376 		}
2377 		return (RDMA_CONNLOST);
2378 	}
2379 
2380 	mutex_exit(&conn->c_lock);
2381 
2382 	if (send_sig) {
2383 		rib_send_hold(qp);
2384 		mutex_exit(&wdesc->sendwait_lock);
2385 		if (cv_sig) {
2386 			/*
2387 			 * cv_wait for send to complete.
2388 			 * We can fail due to a timeout or signal or
2389 			 * unsuccessful send.
2390 			 */
2391 			ret = rib_sendwait(qp, wdesc);
2392 
2393 			return (ret);
2394 		}
2395 	}
2396 
2397 	return (RDMA_SUCCESS);
2398 }
2399 
2400 
2401 rdma_stat
2402 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2403 {
2404 	rdma_stat	ret;
2405 	caddr_t		wd;
2406 
2407 	/* send-wait & cv_signal */
2408 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2409 	return (ret);
2410 }
2411 
2412 /*
2413  * Deprecated/obsolete interface not used currently
2414  * but earlier used for READ-READ protocol.
2415  * Send RPC reply and wait for RDMA_DONE.
2416  */
2417 rdma_stat
2418 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2419 {
2420 	rdma_stat ret = RDMA_SUCCESS;
2421 	struct rdma_done_list *rd;
2422 	clock_t cv_wait_ret;
2423 	caddr_t *wid = NULL;
2424 	rib_qp_t *qp = ctoqp(conn);
2425 
2426 	mutex_enter(&qp->rdlist_lock);
2427 	rd = rdma_done_add(qp, msgid);
2428 
2429 	/* No cv_signal (whether send-wait or no-send-wait) */
2430 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2431 
2432 	if (ret != RDMA_SUCCESS) {
2433 		rdma_done_rm(qp, rd);
2434 	} else {
2435 		/*
2436 		 * Wait for RDMA_DONE from remote end
2437 		 */
2438 		cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2439 		    &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2440 		    TR_CLOCK_TICK);
2441 
2442 		rdma_done_rm(qp, rd);
2443 
2444 		if (cv_wait_ret < 0) {
2445 			ret = RDMA_TIMEDOUT;
2446 		}
2447 	}
2448 
2449 	mutex_exit(&qp->rdlist_lock);
2450 	return (ret);
2451 }
2452 
2453 static struct recv_wid *
2454 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2455 {
2456 	struct recv_wid	*rwid;
2457 
2458 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2459 	rwid->xid = msgid;
2460 	rwid->addr = sgl->ds_va;
2461 	rwid->qp = qp;
2462 
2463 	return (rwid);
2464 }
2465 
2466 static void
2467 rib_free_wid(struct recv_wid *rwid)
2468 {
2469 	kmem_free(rwid, sizeof (struct recv_wid));
2470 }
2471 
2472 rdma_stat
2473 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2474 {
2475 	rib_qp_t	*qp = ctoqp(conn);
2476 	struct clist	*clp = cl;
2477 	struct reply	*rep;
2478 	struct recv_wid	*rwid;
2479 	int		nds;
2480 	ibt_wr_ds_t	sgl[DSEG_MAX];
2481 	ibt_recv_wr_t	recv_wr;
2482 	rdma_stat	ret;
2483 	ibt_status_t	ibt_status;
2484 
2485 	/*
2486 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2487 	 */
2488 
2489 	nds = 0;
2490 	while (cl != NULL) {
2491 		if (nds >= DSEG_MAX) {
2492 			ret = RDMA_FAILED;
2493 			goto done;
2494 		}
2495 		sgl[nds].ds_va = cl->w.c_saddr;
2496 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2497 		sgl[nds].ds_len = cl->c_len;
2498 		cl = cl->c_next;
2499 		nds++;
2500 	}
2501 
2502 	if (nds != 1) {
2503 		ret = RDMA_FAILED;
2504 		goto done;
2505 	}
2506 
2507 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2508 	recv_wr.wr_nds = nds;
2509 	recv_wr.wr_sgl = sgl;
2510 
2511 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2512 	if (rwid) {
2513 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2514 	} else {
2515 		ret = RDMA_NORESOURCE;
2516 		goto done;
2517 	}
2518 	rep = rib_addreplylist(qp, msgid);
2519 	if (!rep) {
2520 		rib_free_wid(rwid);
2521 		ret = RDMA_NORESOURCE;
2522 		goto done;
2523 	}
2524 
2525 	mutex_enter(&conn->c_lock);
2526 
2527 	if (conn->c_state == C_CONNECTED) {
2528 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2529 	}
2530 
2531 	if (conn->c_state != C_CONNECTED ||
2532 	    ibt_status != IBT_SUCCESS) {
2533 		if (conn->c_state != C_DISCONN_PEND)
2534 			conn->c_state = C_ERROR_CONN;
2535 		mutex_exit(&conn->c_lock);
2536 		rib_free_wid(rwid);
2537 		(void) rib_rem_rep(qp, rep);
2538 		ret = RDMA_CONNLOST;
2539 		goto done;
2540 	}
2541 
2542 	mutex_enter(&qp->posted_rbufs_lock);
2543 	qp->n_posted_rbufs++;
2544 	mutex_exit(&qp->posted_rbufs_lock);
2545 
2546 	mutex_exit(&conn->c_lock);
2547 	return (RDMA_SUCCESS);
2548 
2549 done:
2550 	while (clp != NULL) {
2551 		rib_rbuf_free(conn, RECV_BUFFER,
2552 		    (void *)(uintptr_t)clp->w.c_saddr3);
2553 		clp = clp->c_next;
2554 	}
2555 	return (ret);
2556 }
2557 
2558 rdma_stat
2559 rib_svc_post(CONN* conn, struct clist *cl)
2560 {
2561 	rib_qp_t	*qp = ctoqp(conn);
2562 	struct svc_recv	*s_recvp;
2563 	int		nds;
2564 	ibt_wr_ds_t	sgl[DSEG_MAX];
2565 	ibt_recv_wr_t	recv_wr;
2566 	ibt_status_t	ibt_status;
2567 
2568 	nds = 0;
2569 	while (cl != NULL) {
2570 		if (nds >= DSEG_MAX) {
2571 			return (RDMA_FAILED);
2572 		}
2573 		sgl[nds].ds_va = cl->w.c_saddr;
2574 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2575 		sgl[nds].ds_len = cl->c_len;
2576 		cl = cl->c_next;
2577 		nds++;
2578 	}
2579 
2580 	if (nds != 1) {
2581 		rib_rbuf_free(conn, RECV_BUFFER,
2582 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2583 
2584 		return (RDMA_FAILED);
2585 	}
2586 
2587 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2588 	recv_wr.wr_nds = nds;
2589 	recv_wr.wr_sgl = sgl;
2590 
2591 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2592 	/* Use s_recvp's addr as wr id */
2593 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2594 	mutex_enter(&conn->c_lock);
2595 	if (conn->c_state == C_CONNECTED) {
2596 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2597 	}
2598 	if (conn->c_state != C_CONNECTED ||
2599 	    ibt_status != IBT_SUCCESS) {
2600 		if (conn->c_state != C_DISCONN_PEND)
2601 			conn->c_state = C_ERROR_CONN;
2602 		mutex_exit(&conn->c_lock);
2603 		rib_rbuf_free(conn, RECV_BUFFER,
2604 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2605 		(void) rib_free_svc_recv(s_recvp);
2606 
2607 		return (RDMA_CONNLOST);
2608 	}
2609 	mutex_exit(&conn->c_lock);
2610 
2611 	return (RDMA_SUCCESS);
2612 }
2613 
2614 /* Client */
2615 rdma_stat
2616 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2617 {
2618 	return (rib_clnt_post(conn, cl, msgid));
2619 }
2620 
2621 /* Client */
2622 rdma_stat
2623 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2624 {
2625 	rib_qp_t	*qp = ctoqp(conn);
2626 	struct reply	*rep;
2627 
2628 	mutex_enter(&qp->replylist_lock);
2629 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2630 		if (rep->xid == msgid) {
2631 			if (rep->vaddr_cq) {
2632 				rib_rbuf_free(conn, RECV_BUFFER,
2633 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2634 			}
2635 			(void) rib_remreply(qp, rep);
2636 			break;
2637 		}
2638 	}
2639 	mutex_exit(&qp->replylist_lock);
2640 
2641 	return (RDMA_SUCCESS);
2642 }
2643 
2644 /* Server */
2645 rdma_stat
2646 rib_post_recv(CONN *conn, struct clist *cl)
2647 {
2648 	rib_qp_t	*qp = ctoqp(conn);
2649 
2650 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2651 		mutex_enter(&qp->posted_rbufs_lock);
2652 		qp->n_posted_rbufs++;
2653 		mutex_exit(&qp->posted_rbufs_lock);
2654 		return (RDMA_SUCCESS);
2655 	}
2656 	return (RDMA_FAILED);
2657 }
2658 
2659 /*
2660  * Client side only interface to "recv" the rpc reply buf
2661  * posted earlier by rib_post_resp(conn, cl, msgid).
2662  */
2663 rdma_stat
2664 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2665 {
2666 	struct reply *rep = NULL;
2667 	clock_t timout, cv_wait_ret;
2668 	rdma_stat ret = RDMA_SUCCESS;
2669 	rib_qp_t *qp = ctoqp(conn);
2670 
2671 	/*
2672 	 * Find the reply structure for this msgid
2673 	 */
2674 	mutex_enter(&qp->replylist_lock);
2675 
2676 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2677 		if (rep->xid == msgid)
2678 			break;
2679 	}
2680 
2681 	if (rep != NULL) {
2682 		/*
2683 		 * If message not yet received, wait.
2684 		 */
2685 		if (rep->status == (uint_t)REPLY_WAIT) {
2686 			timout = ddi_get_lbolt() +
2687 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2688 
2689 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2690 			    &qp->replylist_lock, timout)) > 0 &&
2691 			    rep->status == (uint_t)REPLY_WAIT)
2692 				;
2693 
2694 			switch (cv_wait_ret) {
2695 			case -1:	/* timeout */
2696 				ret = RDMA_TIMEDOUT;
2697 				break;
2698 			case 0:
2699 				ret = RDMA_INTR;
2700 				break;
2701 			default:
2702 				break;
2703 			}
2704 		}
2705 
2706 		if (rep->status == RDMA_SUCCESS) {
2707 			struct clist *cl = NULL;
2708 
2709 			/*
2710 			 * Got message successfully
2711 			 */
2712 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2713 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2714 			*clp = cl;
2715 		} else {
2716 			if (rep->status != (uint_t)REPLY_WAIT) {
2717 				/*
2718 				 * Got error in reply message. Free
2719 				 * recv buffer here.
2720 				 */
2721 				ret = rep->status;
2722 				rib_rbuf_free(conn, RECV_BUFFER,
2723 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2724 			}
2725 		}
2726 		(void) rib_remreply(qp, rep);
2727 	} else {
2728 		/*
2729 		 * No matching reply structure found for given msgid on the
2730 		 * reply wait list.
2731 		 */
2732 		ret = RDMA_INVAL;
2733 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2734 	}
2735 
2736 	/*
2737 	 * Done.
2738 	 */
2739 	mutex_exit(&qp->replylist_lock);
2740 	return (ret);
2741 }
2742 
2743 /*
2744  * RDMA write a buffer to the remote address.
2745  */
2746 rdma_stat
2747 rib_write(CONN *conn, struct clist *cl, int wait)
2748 {
2749 	ibt_send_wr_t	tx_wr;
2750 	int		cv_sig;
2751 	ibt_wr_ds_t	sgl[DSEG_MAX];
2752 	struct send_wid	*wdesc;
2753 	ibt_status_t	ibt_status;
2754 	rdma_stat	ret = RDMA_SUCCESS;
2755 	rib_qp_t	*qp = ctoqp(conn);
2756 	uint64_t	n_writes = 0;
2757 
2758 	if (cl == NULL) {
2759 		return (RDMA_FAILED);
2760 	}
2761 
2762 	while ((cl != NULL)) {
2763 		if (cl->c_len > 0) {
2764 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2765 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2766 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2767 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2768 			sgl[0].ds_va = cl->w.c_saddr;
2769 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2770 			sgl[0].ds_len = cl->c_len;
2771 
2772 			if (wait) {
2773 				cv_sig = 1;
2774 			} else {
2775 				if (n_writes > max_unsignaled_rws) {
2776 					n_writes = 0;
2777 					cv_sig = 1;
2778 				} else {
2779 					cv_sig = 0;
2780 				}
2781 			}
2782 
2783 			if (cv_sig) {
2784 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2785 				wdesc = rib_init_sendwait(0, cv_sig, qp);
2786 				tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2787 				mutex_enter(&wdesc->sendwait_lock);
2788 			} else {
2789 				tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2790 				tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2791 			}
2792 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2793 			tx_wr.wr_trans = IBT_RC_SRV;
2794 			tx_wr.wr_nds = 1;
2795 			tx_wr.wr_sgl = sgl;
2796 
2797 			mutex_enter(&conn->c_lock);
2798 			if (conn->c_state == C_CONNECTED) {
2799 				ibt_status =
2800 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2801 			}
2802 			if (conn->c_state != C_CONNECTED ||
2803 			    ibt_status != IBT_SUCCESS) {
2804 				if (conn->c_state != C_DISCONN_PEND)
2805 					conn->c_state = C_ERROR_CONN;
2806 				mutex_exit(&conn->c_lock);
2807 				if (cv_sig) {
2808 					mutex_exit(&wdesc->sendwait_lock);
2809 					(void) rib_free_sendwait(wdesc);
2810 				}
2811 				return (RDMA_CONNLOST);
2812 			}
2813 
2814 			mutex_exit(&conn->c_lock);
2815 
2816 			/*
2817 			 * Wait for send to complete
2818 			 */
2819 			if (cv_sig) {
2820 
2821 				rib_send_hold(qp);
2822 				mutex_exit(&wdesc->sendwait_lock);
2823 
2824 				ret = rib_sendwait(qp, wdesc);
2825 				if (ret != 0)
2826 					return (ret);
2827 			}
2828 			n_writes ++;
2829 		}
2830 		cl = cl->c_next;
2831 	}
2832 	return (RDMA_SUCCESS);
2833 }
2834 
2835 /*
2836  * RDMA Read a buffer from the remote address.
2837  */
2838 rdma_stat
2839 rib_read(CONN *conn, struct clist *cl, int wait)
2840 {
2841 	ibt_send_wr_t	rx_wr;
2842 	int		cv_sig = 0;
2843 	ibt_wr_ds_t	sgl;
2844 	struct send_wid	*wdesc;
2845 	ibt_status_t	ibt_status = IBT_SUCCESS;
2846 	rdma_stat	ret = RDMA_SUCCESS;
2847 	rib_qp_t	*qp = ctoqp(conn);
2848 
2849 	if (cl == NULL) {
2850 		return (RDMA_FAILED);
2851 	}
2852 
2853 	while (cl != NULL) {
2854 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2855 		/*
2856 		 * Remote address is at the head chunk item in list.
2857 		 */
2858 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2859 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2860 
2861 		sgl.ds_va = cl->u.c_daddr;
2862 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2863 		sgl.ds_len = cl->c_len;
2864 
2865 		/*
2866 		 * If there are multiple chunks to be read, and
2867 		 * wait is set, ask for signal only for the last chunk
2868 		 * and wait only on the last chunk. The completion of
2869 		 * RDMA_READ on last chunk ensures that reads on all
2870 		 * previous chunks are also completed.
2871 		 */
2872 		if (wait && (cl->c_next == NULL)) {
2873 			cv_sig = 1;
2874 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2875 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2876 			rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2877 			mutex_enter(&wdesc->sendwait_lock);
2878 		} else {
2879 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2880 			rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2881 		}
2882 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2883 		rx_wr.wr_trans = IBT_RC_SRV;
2884 		rx_wr.wr_nds = 1;
2885 		rx_wr.wr_sgl = &sgl;
2886 
2887 		mutex_enter(&conn->c_lock);
2888 		if (conn->c_state == C_CONNECTED) {
2889 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2890 		}
2891 		if (conn->c_state != C_CONNECTED ||
2892 		    ibt_status != IBT_SUCCESS) {
2893 			if (conn->c_state != C_DISCONN_PEND)
2894 				conn->c_state = C_ERROR_CONN;
2895 			mutex_exit(&conn->c_lock);
2896 			if (wait && (cl->c_next == NULL)) {
2897 				mutex_exit(&wdesc->sendwait_lock);
2898 				(void) rib_free_sendwait(wdesc);
2899 			}
2900 			return (RDMA_CONNLOST);
2901 		}
2902 
2903 		mutex_exit(&conn->c_lock);
2904 
2905 		/*
2906 		 * Wait for send to complete if this is the
2907 		 * last item in the list.
2908 		 */
2909 		if (wait && cl->c_next == NULL) {
2910 			rib_send_hold(qp);
2911 			mutex_exit(&wdesc->sendwait_lock);
2912 
2913 			ret = rib_sendwait(qp, wdesc);
2914 
2915 			if (ret != 0)
2916 				return (ret);
2917 		}
2918 		cl = cl->c_next;
2919 	}
2920 	return (RDMA_SUCCESS);
2921 }
2922 
/*
 * rib_srv_cm_handler()
 *    Connection Manager callback to handle RC connection requests.
 *
 *    any:	the rib_hca_t the event applies to (cast below).
 *    event:	the CM event being delivered.
 *    ret_args:	filled in with the reply parameters for a connection
 *		request (channel handle, RDMA read depths, RNR retry count).
 *    priv_data, len: not referenced by this handler.
 *
 *    Handled event types:
 *	IBT_CM_EVENT_REQ_RCV	 - create a server channel, pre-post receive
 *				   buffers, add the connection to the HCA's
 *				   server connection list and record the
 *				   peer/local addresses on the connection.
 *	IBT_CM_EVENT_CONN_CLOSED - for remotely initiated closes, move the
 *				   connection to an error state and tear it
 *				   down if it has no references.
 *	IBT_CM_EVENT_CONN_EST	 - optional debug message only.
 *	anything else		 - optional debug message, then
 *				   IBT_CM_DEFAULT is returned.
 *
 *    Returns IBT_CM_REJECT when a connection request cannot be honoured,
 *    IBT_CM_DEFAULT for event types not handled here, and IBT_CM_ACCEPT
 *    otherwise.
 */
/* ARGSUSED */
static ibt_cm_status_t
rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
	ibt_cm_return_args_t *ret_args, void *priv_data,
	ibt_priv_data_len_t len)
{
	queue_t		*q;
	rib_qp_t	*qp;
	rib_hca_t	*hca;
	rdma_stat	status = RDMA_SUCCESS;
	int		i;
	struct clist	cl;
	rdma_buf_t	rdbuf = {0};
	void		*buf = NULL;
	CONN		*conn;
	ibt_ip_cm_info_t	ipinfo;
	struct sockaddr_in *s;
	struct sockaddr_in6 *s6;
	int sin_size = sizeof (struct sockaddr_in);
	int in_size = sizeof (struct in_addr);
	int sin6_size = sizeof (struct sockaddr_in6);

	ASSERT(any != NULL);
	ASSERT(event != NULL);

	hca = (rib_hca_t *)any;

	/* got a connection request */
	switch (event->cm_type) {
	case IBT_CM_EVENT_REQ_RCV:
		/*
		 * If the plugin is in the NO_ACCEPT state, bail out.
		 */
		mutex_enter(&plugin_state_lock);
		if (plugin_state == NO_ACCEPT) {
			mutex_exit(&plugin_state_lock);
			return (IBT_CM_REJECT);
		}
		mutex_exit(&plugin_state_lock);

		/*
		 * Need to send a MRA MAD to CM so that it does not
		 * timeout on us.
		 */
		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
		    event->cm_event.req.req_timeout * 8, NULL, 0);

		/* Snapshot the queue pointer under open_hca_lock. */
		mutex_enter(&rib_stat->open_hca_lock);
		q = rib_stat->q;
		mutex_exit(&rib_stat->open_hca_lock);

		status = rib_svc_create_chan(hca, (caddr_t)q,
		    event->cm_event.req.req_prim_hca_port, &qp);

		if (status) {
			return (IBT_CM_REJECT);
		}

		/* Reply parameters handed back to the CM. */
		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;

		/*
		 * Pre-posts RECV buffers
		 */
		conn = qptoc(qp);
		for (i = 0; i < preposted_rbufs; i++) {
			bzero(&rdbuf, sizeof (rdbuf));
			rdbuf.type = RECV_BUFFER;
			buf = rib_rbuf_alloc(conn, &rdbuf);
			if (buf == NULL) {
				/*
				 * A connection is not established yet.
				 * Just flush the channel. Buffers
				 * posted till now will error out with
				 * IBT_WC_WR_FLUSHED_ERR.
				 */
				(void) ibt_flush_channel(qp->qp_hdl);
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}

			/* Describe the buffer as a single-element chunk list. */
			bzero(&cl, sizeof (cl));
			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
			cl.c_len = rdbuf.len;
			cl.c_smemhandle.mrc_lmr =
			    rdbuf.handle.mrc_lmr; /* lkey */
			cl.c_next = NULL;
			status = rib_post_recv(conn, &cl);
			if (status != RDMA_SUCCESS) {
				/*
				 * A connection is not established yet.
				 * Just flush the channel. Buffers
				 * posted till now will error out with
				 * IBT_WC_WR_FLUSHED_ERR.
				 */
				(void) ibt_flush_channel(qp->qp_hdl);
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}
		}
		(void) rib_add_connlist(conn, &hca->srv_conn_list);

		/*
		 * Get the address translation
		 *
		 * NOTE(review): the IBT_CM_REJECT returns from here to the
		 * end of this case happen after the connection was added to
		 * srv_conn_list and do not tear the channel down in this
		 * function -- confirm that cleanup happens elsewhere.
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_exit(&hca->state_lock);
			return (IBT_CM_REJECT);
		}
		rw_exit(&hca->state_lock);

		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));

		/* Extract source/destination IP info from the REQ private data. */
		if (ibt_get_ip_data(event->cm_priv_data_len,
		    event->cm_priv_data,
		    &ipinfo) != IBT_SUCCESS) {

			return (IBT_CM_REJECT);
		}

		/*
		 * Record netid, remote address (from src_addr), local
		 * address (from dst_addr) and an all-ones address mask on
		 * the connection, according to the address family.
		 */
		switch (ipinfo.src_addr.family) {
		case AF_INET:

			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
			    KM_SLEEP);
			(void) strcpy(conn->c_netid, RIBNETID_TCP);

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin_size;
			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);

			s = (struct sockaddr_in *)conn->c_raddr.buf;
			s->sin_family = AF_INET;
			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
			    &s->sin_addr, in_size);

			conn->c_laddr.maxlen =
			    conn->c_laddr.len = sin_size;
			conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);

			s = (struct sockaddr_in *)conn->c_laddr.buf;
			s->sin_family = AF_INET;
			bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
			    &s->sin_addr, in_size);

			/* Mask: all bits set in both family and address. */
			conn->c_addrmask.maxlen = conn->c_addrmask.len =
			    sizeof (struct sockaddr_in);
			conn->c_addrmask.buf =
			    kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
			((struct sockaddr_in *)
			    conn->c_addrmask.buf)->sin_addr.s_addr =
			    (uint32_t)~0;
			((struct sockaddr_in *)
			    conn->c_addrmask.buf)->sin_family =
			    (sa_family_t)~0;
			break;

		case AF_INET6:

			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
			    KM_SLEEP);
			(void) strcpy(conn->c_netid, RIBNETID_TCP6);

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin6_size;
			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);

			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
			s6->sin6_family = AF_INET6;
			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
			    &s6->sin6_addr,
			    sizeof (struct in6_addr));

			conn->c_laddr.maxlen =
			    conn->c_laddr.len = sin6_size;
			conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);

			s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
			s6->sin6_family = AF_INET6;
			bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
			    &s6->sin6_addr,
			    sizeof (struct in6_addr));

			/* Mask: all bits set in both family and address. */
			conn->c_addrmask.maxlen = conn->c_addrmask.len =
			    sizeof (struct sockaddr_in6);
			conn->c_addrmask.buf =
			    kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
			(void) memset(&((struct sockaddr_in6 *)
			    conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0,
			    sizeof (struct in6_addr));
			((struct sockaddr_in6 *)
			    conn->c_addrmask.buf)->sin6_family =
			    (sa_family_t)~0;
			break;

		default:
			/* Unsupported address family. */
			return (IBT_CM_REJECT);
		}

		break;

	case IBT_CM_EVENT_CONN_CLOSED:
	{
		/* These shadow the function-level conn and qp. */
		CONN		*conn;
		rib_qp_t	*qp;

		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				/* A disconnect is already pending. */
				mutex_exit(&conn->c_lock);
				break;
			}
			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the conn if c_ref goes down to 0
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				(void) rib_disconnect_channel(conn,
				    &hca->srv_conn_list);
			} else {
				/*
				 * conn will be freed when c_ref goes to 0.
				 * Indicate to cleaning thread not to close
				 * the connection, but just free the channel.
				 */
				conn->c_flags |= C_CLOSE_NOTNEEDED;
				mutex_exit(&conn->c_lock);
			}
			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
			break;
		}
		break;
	}
	case IBT_CM_EVENT_CONN_EST:
		/*
		 * RTU received, hence connection established.
		 */
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "(CONN_EST) channel established");
		break;

	default:
		if (rib_debug > 2) {
			/* Let CM handle the following events. */
			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
			}
		}
		return (IBT_CM_DEFAULT);
	}

	/* accept all other CM messages (i.e. let the CM handle them) */
	return (IBT_CM_ACCEPT);
}
3225 
3226 static rdma_stat
3227 rib_register_service(rib_hca_t *hca, int service_type,
3228 	uint8_t protocol_num, in_port_t dst_port)
3229 {
3230 	ibt_srv_desc_t		sdesc;
3231 	ibt_hca_portinfo_t	*port_infop;
3232 	ib_svc_id_t		srv_id;
3233 	ibt_srv_hdl_t		srv_hdl;
3234 	uint_t			port_size;
3235 	uint_t			pki, i, num_ports, nbinds;
3236 	ibt_status_t		ibt_status;
3237 	rib_service_t		*service;
3238 	ib_pkey_t		pkey;
3239 
3240 	/*
3241 	 * Query all ports for the given HCA
3242 	 */
3243 	rw_enter(&hca->state_lock, RW_READER);
3244 	if (hca->state != HCA_DETACHED) {
3245 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3246 		    &num_ports, &port_size);
3247 		rw_exit(&hca->state_lock);
3248 	} else {
3249 		rw_exit(&hca->state_lock);
3250 		return (RDMA_FAILED);
3251 	}
3252 	if (ibt_status != IBT_SUCCESS) {
3253 		return (RDMA_FAILED);
3254 	}
3255 
3256 	DTRACE_PROBE1(rpcib__i__regservice_numports,
3257 	    int, num_ports);
3258 
3259 	for (i = 0; i < num_ports; i++) {
3260 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3261 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3262 			    int, i+1);
3263 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3264 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
3265 			    int, i+1);
3266 		}
3267 	}
3268 
3269 	/*
3270 	 * Get all the IP addresses on this system to register the
3271 	 * given "service type" on all DNS recognized IP addrs.
3272 	 * Each service type such as NFS will have all the systems
3273 	 * IP addresses as its different names. For now the only
3274 	 * type of service we support in RPCIB is NFS.
3275 	 */
3276 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3277 	/*
3278 	 * Start registering and binding service to active
3279 	 * on active ports on this HCA.
3280 	 */
3281 	nbinds = 0;
3282 	for (service = rib_stat->service_list;
3283 	    service && (service->srv_type != service_type);
3284 	    service = service->next)
3285 		;
3286 
3287 	if (service == NULL) {
3288 		/*
3289 		 * We use IP addresses as the service names for
3290 		 * service registration.  Register each of them
3291 		 * with CM to obtain a svc_id and svc_hdl.  We do not
3292 		 * register the service with machine's loopback address.
3293 		 */
3294 		(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3295 		(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3296 		(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3297 		sdesc.sd_handler = rib_srv_cm_handler;
3298 		sdesc.sd_flags = 0;
3299 		ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3300 		    &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3301 		    1, &srv_hdl, &srv_id);
3302 		if ((ibt_status != IBT_SUCCESS) &&
3303 		    (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3304 			rw_exit(&rib_stat->service_list_lock);
3305 			DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3306 			    int, ibt_status);
3307 			ibt_free_portinfo(port_infop, port_size);
3308 			return (RDMA_FAILED);
3309 		}
3310 
3311 		/*
3312 		 * Allocate and prepare a service entry
3313 		 */
3314 		service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3315 
3316 		service->srv_type = service_type;
3317 		service->srv_hdl = srv_hdl;
3318 		service->srv_id = srv_id;
3319 
3320 		service->next = rib_stat->service_list;
3321 		rib_stat->service_list = service;
3322 		DTRACE_PROBE1(rpcib__i__regservice__new__service,
3323 		    int, service->srv_type);
3324 	} else {
3325 		srv_hdl = service->srv_hdl;
3326 		srv_id = service->srv_id;
3327 		DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3328 		    int, service->srv_type);
3329 	}
3330 
3331 	for (i = 0; i < num_ports; i++) {
3332 		ibt_sbind_hdl_t		sbp;
3333 		rib_hca_service_t	*hca_srv;
3334 		ib_gid_t		gid;
3335 
3336 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3337 			continue;
3338 
3339 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3340 			pkey = port_infop[i].p_pkey_tbl[pki];
3341 
3342 			rw_enter(&hca->bound_services_lock, RW_READER);
3343 			gid = port_infop[i].p_sgid_tbl[0];
3344 			for (hca_srv = hca->bound_services; hca_srv;
3345 			    hca_srv = hca_srv->next) {
3346 				if ((hca_srv->srv_id == service->srv_id) &&
3347 				    (hca_srv->gid.gid_prefix ==
3348 				    gid.gid_prefix) &&
3349 				    (hca_srv->gid.gid_guid == gid.gid_guid))
3350 					break;
3351 			}
3352 			rw_exit(&hca->bound_services_lock);
3353 			if (hca_srv != NULL) {
3354 				/*
3355 				 * port is alreay bound the the service
3356 				 */
3357 				DTRACE_PROBE1(
3358 				    rpcib__i__regservice__already__bound,
3359 				    int, i+1);
3360 				nbinds++;
3361 				continue;
3362 			}
3363 
3364 			if ((pkey & IBSRM_HB) &&
3365 			    (pkey != IB_PKEY_INVALID_FULL)) {
3366 
3367 				sbp = NULL;
3368 				ibt_status = ibt_bind_service(srv_hdl,
3369 				    gid, NULL, hca, &sbp);
3370 
3371 				if (ibt_status == IBT_SUCCESS) {
3372 					hca_srv = kmem_zalloc(
3373 					    sizeof (rib_hca_service_t),
3374 					    KM_SLEEP);
3375 					hca_srv->srv_id = srv_id;
3376 					hca_srv->gid = gid;
3377 					hca_srv->sbind_hdl = sbp;
3378 
3379 					rw_enter(&hca->bound_services_lock,
3380 					    RW_WRITER);
3381 					hca_srv->next = hca->bound_services;
3382 					hca->bound_services = hca_srv;
3383 					rw_exit(&hca->bound_services_lock);
3384 					nbinds++;
3385 				}
3386 
3387 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3388 				    int, ibt_status);
3389 			}
3390 		}
3391 	}
3392 	rw_exit(&rib_stat->service_list_lock);
3393 
3394 	ibt_free_portinfo(port_infop, port_size);
3395 
3396 	if (nbinds == 0) {
3397 		return (RDMA_FAILED);
3398 	} else {
3399 		/*
3400 		 * Put this plugin into accept state, since atleast
3401 		 * one registration was successful.
3402 		 */
3403 		mutex_enter(&plugin_state_lock);
3404 		plugin_state = ACCEPT;
3405 		mutex_exit(&plugin_state_lock);
3406 		return (RDMA_SUCCESS);
3407 	}
3408 }
3409 
3410 void
3411 rib_listen(struct rdma_svc_data *rd)
3412 {
3413 	rdma_stat status;
3414 	int n_listening = 0;
3415 	rib_hca_t *hca;
3416 
3417 	mutex_enter(&rib_stat->listen_lock);
3418 	/*
3419 	 * if rd parameter is NULL then it means that rib_stat->q is
3420 	 * already initialized by a call from RDMA and we just want to
3421 	 * add a newly attached HCA to the same listening state as other
3422 	 * HCAs.
3423 	 */
3424 	if (rd == NULL) {
3425 		if (rib_stat->q == NULL) {
3426 			mutex_exit(&rib_stat->listen_lock);
3427 			return;
3428 		}
3429 	} else {
3430 		rib_stat->q = &rd->q;
3431 	}
3432 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3433 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3434 		/*
3435 		 * First check if a hca is still attached
3436 		 */
3437 		rw_enter(&hca->state_lock, RW_READER);
3438 		if (hca->state != HCA_INITED) {
3439 			rw_exit(&hca->state_lock);
3440 			continue;
3441 		}
3442 		rw_exit(&hca->state_lock);
3443 
3444 		/*
3445 		 * Right now the only service type is NFS. Hence
3446 		 * force feed this value. Ideally to communicate
3447 		 * the service type it should be passed down in
3448 		 * rdma_svc_data.
3449 		 */
3450 		status = rib_register_service(hca, NFS,
3451 		    IPPROTO_TCP, nfs_rdma_port);
3452 		if (status == RDMA_SUCCESS)
3453 			n_listening++;
3454 	}
3455 	rw_exit(&rib_stat->hcas_list_lock);
3456 
3457 	/*
3458 	 * Service active on an HCA, check rd->err_code for more
3459 	 * explainable errors.
3460 	 */
3461 	if (rd) {
3462 		if (n_listening > 0) {
3463 			rd->active = 1;
3464 			rd->err_code = RDMA_SUCCESS;
3465 		} else {
3466 			rd->active = 0;
3467 			rd->err_code = RDMA_FAILED;
3468 		}
3469 	}
3470 	mutex_exit(&rib_stat->listen_lock);
3471 }
3472 
3473 /* XXXX */
3474 /* ARGSUSED */
3475 static void
3476 rib_listen_stop(struct rdma_svc_data *svcdata)
3477 {
3478 	rib_hca_t		*hca;
3479 
3480 	mutex_enter(&rib_stat->listen_lock);
3481 	/*
3482 	 * KRPC called the RDMATF to stop the listeners, this means
3483 	 * stop sending incomming or recieved requests to KRPC master
3484 	 * transport handle for RDMA-IB. This is also means that the
3485 	 * master transport handle, responsible for us, is going away.
3486 	 */
3487 	mutex_enter(&plugin_state_lock);
3488 	plugin_state = NO_ACCEPT;
3489 	if (svcdata != NULL)
3490 		svcdata->active = 0;
3491 	mutex_exit(&plugin_state_lock);
3492 
3493 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3494 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3495 		/*
3496 		 * First check if a hca is still attached
3497 		 */
3498 		rw_enter(&hca->state_lock, RW_READER);
3499 		if (hca->state == HCA_DETACHED) {
3500 			rw_exit(&hca->state_lock);
3501 			continue;
3502 		}
3503 		rib_close_channels(&hca->srv_conn_list);
3504 		rib_stop_services(hca);
3505 		rw_exit(&hca->state_lock);
3506 	}
3507 	rw_exit(&rib_stat->hcas_list_lock);
3508 
3509 	/*
3510 	 * Avoid rib_listen() using the stale q field.
3511 	 * This could happen if a port goes up after all services
3512 	 * are already unregistered.
3513 	 */
3514 	rib_stat->q = NULL;
3515 	mutex_exit(&rib_stat->listen_lock);
3516 }
3517 
3518 /*
3519  * Traverse the HCA's service list to unbind and deregister services.
3520  * For each bound service of HCA to be removed, first find the corresponding
3521  * service handle (srv_hdl) and then unbind the service by calling
3522  * ibt_unbind_service().
3523  */
3524 static void
3525 rib_stop_services(rib_hca_t *hca)
3526 {
3527 	rib_hca_service_t *srv_list, *to_remove;
3528 
3529 	/*
3530 	 * unbind and deregister the services for this service type.
3531 	 * Right now there is only one service type. In future it will
3532 	 * be passed down to this function.
3533 	 */
3534 	rw_enter(&hca->bound_services_lock, RW_READER);
3535 	srv_list = hca->bound_services;
3536 	hca->bound_services = NULL;
3537 	rw_exit(&hca->bound_services_lock);
3538 
3539 	while (srv_list != NULL) {
3540 		rib_service_t *sc;
3541 
3542 		to_remove = srv_list;
3543 		srv_list = to_remove->next;
3544 		rw_enter(&rib_stat->service_list_lock, RW_READER);
3545 		for (sc = rib_stat->service_list;
3546 		    sc && (sc->srv_id != to_remove->srv_id);
3547 		    sc = sc->next)
3548 			;
3549 		/*
3550 		 * if sc is NULL then the service doesn't exist anymore,
3551 		 * probably just removed completely through rib_stat.
3552 		 */
3553 		if (sc != NULL)
3554 			(void) ibt_unbind_service(sc->srv_hdl,
3555 			    to_remove->sbind_hdl);
3556 		rw_exit(&rib_stat->service_list_lock);
3557 		kmem_free(to_remove, sizeof (rib_hca_service_t));
3558 	}
3559 }
3560 
3561 static struct svc_recv *
3562 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3563 {
3564 	struct svc_recv	*recvp;
3565 
3566 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3567 	recvp->vaddr = sgl->ds_va;
3568 	recvp->qp = qp;
3569 	recvp->bytes_xfer = 0;
3570 	return (recvp);
3571 }
3572 
3573 static int
3574 rib_free_svc_recv(struct svc_recv *recvp)
3575 {
3576 	kmem_free(recvp, sizeof (*recvp));
3577 
3578 	return (0);
3579 }
3580 
3581 static struct reply *
3582 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3583 {
3584 	struct reply	*rep;
3585 
3586 
3587 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3588 	if (rep == NULL) {
3589 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3590 		return (NULL);
3591 	}
3592 	rep->xid = msgid;
3593 	rep->vaddr_cq = NULL;
3594 	rep->bytes_xfer = 0;
3595 	rep->status = (uint_t)REPLY_WAIT;
3596 	rep->prev = NULL;
3597 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3598 
3599 	mutex_enter(&qp->replylist_lock);
3600 	if (qp->replylist) {
3601 		rep->next = qp->replylist;
3602 		qp->replylist->prev = rep;
3603 	}
3604 	qp->rep_list_size++;
3605 
3606 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3607 	    int, qp->rep_list_size);
3608 
3609 	qp->replylist = rep;
3610 	mutex_exit(&qp->replylist_lock);
3611 
3612 	return (rep);
3613 }
3614 
3615 static rdma_stat
3616 rib_rem_replylist(rib_qp_t *qp)
3617 {
3618 	struct reply	*r, *n;
3619 
3620 	mutex_enter(&qp->replylist_lock);
3621 	for (r = qp->replylist; r != NULL; r = n) {
3622 		n = r->next;
3623 		(void) rib_remreply(qp, r);
3624 	}
3625 	mutex_exit(&qp->replylist_lock);
3626 
3627 	return (RDMA_SUCCESS);
3628 }
3629 
3630 static int
3631 rib_remreply(rib_qp_t *qp, struct reply *rep)
3632 {
3633 
3634 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3635 	if (rep->prev) {
3636 		rep->prev->next = rep->next;
3637 	}
3638 	if (rep->next) {
3639 		rep->next->prev = rep->prev;
3640 	}
3641 	if (qp->replylist == rep)
3642 		qp->replylist = rep->next;
3643 
3644 	cv_destroy(&rep->wait_cv);
3645 	qp->rep_list_size--;
3646 
3647 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3648 	    int, qp->rep_list_size);
3649 
3650 	kmem_free(rep, sizeof (*rep));
3651 
3652 	return (0);
3653 }
3654 
3655 rdma_stat
3656 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3657 	struct mrc *buf_handle)
3658 {
3659 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3660 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3661 	rdma_stat	status;
3662 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3663 
3664 	/*
3665 	 * Note: ALL buffer pools use the same memory type RDMARW.
3666 	 */
3667 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3668 	if (status == RDMA_SUCCESS) {
3669 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3670 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3671 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3672 	} else {
3673 		buf_handle->mrc_linfo = NULL;
3674 		buf_handle->mrc_lmr = 0;
3675 		buf_handle->mrc_rmr = 0;
3676 	}
3677 	return (status);
3678 }
3679 
3680 static rdma_stat
3681 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3682 	ibt_mr_flags_t spec,
3683 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3684 {
3685 	ibt_mr_attr_t	mem_attr;
3686 	ibt_status_t	ibt_status;
3687 	mem_attr.mr_vaddr = (uintptr_t)buf;
3688 	mem_attr.mr_len = (ib_msglen_t)size;
3689 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3690 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3691 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3692 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3693 
3694 	rw_enter(&hca->state_lock, RW_READER);
3695 	if (hca->state != HCA_DETACHED) {
3696 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3697 		    &mem_attr, mr_hdlp, mr_descp);
3698 		rw_exit(&hca->state_lock);
3699 	} else {
3700 		rw_exit(&hca->state_lock);
3701 		return (RDMA_FAILED);
3702 	}
3703 
3704 	if (ibt_status != IBT_SUCCESS) {
3705 		return (RDMA_FAILED);
3706 	}
3707 	return (RDMA_SUCCESS);
3708 }
3709 
3710 rdma_stat
3711 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3712 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3713 {
3714 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3715 	rib_lrc_entry_t *l;
3716 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3717 	rdma_stat	status;
3718 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3719 
3720 	/*
3721 	 * Non-coherent memory registration.
3722 	 */
3723 	l = (rib_lrc_entry_t *)lrc;
3724 	if (l) {
3725 		if (l->registered) {
3726 			buf_handle->mrc_linfo =
3727 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3728 			buf_handle->mrc_lmr =
3729 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3730 			buf_handle->mrc_rmr =
3731 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3732 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3733 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3734 			return (RDMA_SUCCESS);
3735 		} else {
3736 			/* Always register the whole buffer */
3737 			buf = (caddr_t)l->lrc_buf;
3738 			buflen = l->lrc_len;
3739 		}
3740 	}
3741 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3742 
3743 	if (status == RDMA_SUCCESS) {
3744 		if (l) {
3745 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3746 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3747 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3748 			l->registered		 = TRUE;
3749 		}
3750 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3751 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3752 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3753 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3754 	} else {
3755 		buf_handle->mrc_linfo = NULL;
3756 		buf_handle->mrc_lmr = 0;
3757 		buf_handle->mrc_rmr = 0;
3758 	}
3759 	return (status);
3760 }
3761 
3762 /* ARGSUSED */
3763 rdma_stat
3764 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3765 {
3766 	rib_hca_t *hca = (ctoqp(conn))->hca;
3767 	/*
3768 	 * Allow memory deregistration even if HCA is
3769 	 * getting detached. Need all outstanding
3770 	 * memory registrations to be deregistered
3771 	 * before HCA_DETACH_EVENT can be accepted.
3772 	 */
3773 	(void) ibt_deregister_mr(hca->hca_hdl,
3774 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3775 	return (RDMA_SUCCESS);
3776 }
3777 
3778 /* ARGSUSED */
3779 rdma_stat
3780 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3781 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3782 {
3783 	rib_lrc_entry_t *l;
3784 	l = (rib_lrc_entry_t *)lrc;
3785 	if (l)
3786 		if (l->registered)
3787 			return (RDMA_SUCCESS);
3788 
3789 	(void) rib_deregistermem(conn, buf, buf_handle);
3790 
3791 	return (RDMA_SUCCESS);
3792 }
3793 
3794 /* ARGSUSED */
3795 rdma_stat
3796 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3797 		int len, int cpu)
3798 {
3799 	ibt_status_t	status;
3800 	rib_hca_t *hca = (ctoqp(conn))->hca;
3801 	ibt_mr_sync_t	mr_segment;
3802 
3803 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3804 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3805 	mr_segment.ms_len = (ib_memlen_t)len;
3806 	if (cpu) {
3807 		/* make incoming data visible to memory */
3808 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3809 	} else {
3810 		/* make memory changes visible to IO */
3811 		mr_segment.ms_flags = IBT_SYNC_READ;
3812 	}
3813 	rw_enter(&hca->state_lock, RW_READER);
3814 	if (hca->state != HCA_DETACHED) {
3815 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3816 		rw_exit(&hca->state_lock);
3817 	} else {
3818 		rw_exit(&hca->state_lock);
3819 		return (RDMA_FAILED);
3820 	}
3821 
3822 	if (status == IBT_SUCCESS)
3823 		return (RDMA_SUCCESS);
3824 	else {
3825 		return (RDMA_FAILED);
3826 	}
3827 }
3828 
3829 /*
3830  * XXXX	????
3831  */
3832 static rdma_stat
3833 rib_getinfo(rdma_info_t *info)
3834 {
3835 	/*
3836 	 * XXXX	Hack!
3837 	 */
3838 	info->addrlen = 16;
3839 	info->mts = 1000000;
3840 	info->mtu = 1000000;
3841 
3842 	return (RDMA_SUCCESS);
3843 }
3844 
3845 rib_bufpool_t *
3846 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3847 {
3848 	rib_bufpool_t	*rbp = NULL;
3849 	bufpool_t	*bp = NULL;
3850 	caddr_t		buf;
3851 	ibt_mr_attr_t	mem_attr;
3852 	ibt_status_t	ibt_status;
3853 	int		i, j;
3854 
3855 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3856 
3857 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3858 	    num * sizeof (void *), KM_SLEEP);
3859 
3860 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3861 	bp->numelems = num;
3862 
3863 
3864 	switch (ptype) {
3865 	case SEND_BUFFER:
3866 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3867 		bp->rsize = RPC_MSG_SZ;
3868 		break;
3869 	case RECV_BUFFER:
3870 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3871 		bp->rsize = RPC_BUF_SIZE;
3872 		break;
3873 	default:
3874 		goto fail;
3875 	}
3876 
3877 	/*
3878 	 * Register the pool.
3879 	 */
3880 	bp->bufsize = num * bp->rsize;
3881 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3882 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3883 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3884 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3885 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3886 	rw_enter(&hca->state_lock, RW_READER);
3887 
3888 	if (hca->state == HCA_DETACHED) {
3889 		rw_exit(&hca->state_lock);
3890 		goto fail;
3891 	}
3892 
3893 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3894 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3895 		mem_attr.mr_vaddr = (uintptr_t)buf;
3896 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3897 		mem_attr.mr_as = NULL;
3898 		ibt_status = ibt_register_mr(hca->hca_hdl,
3899 		    hca->pd_hdl, &mem_attr,
3900 		    &rbp->mr_hdl[i],
3901 		    &rbp->mr_desc[i]);
3902 		if (ibt_status != IBT_SUCCESS) {
3903 			for (j = 0; j < i; j++) {
3904 				(void) ibt_deregister_mr(hca->hca_hdl,
3905 				    rbp->mr_hdl[j]);
3906 			}
3907 			rw_exit(&hca->state_lock);
3908 			goto fail;
3909 		}
3910 	}
3911 	rw_exit(&hca->state_lock);
3912 	buf = (caddr_t)bp->buf;
3913 	for (i = 0; i < num; i++, buf += bp->rsize) {
3914 		bp->buflist[i] = (void *)buf;
3915 	}
3916 	bp->buffree = num - 1;	/* no. of free buffers */
3917 	rbp->bpool = bp;
3918 
3919 	return (rbp);
3920 fail:
3921 	if (bp) {
3922 		if (bp->buf)
3923 			kmem_free(bp->buf, bp->bufsize);
3924 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3925 	}
3926 	if (rbp) {
3927 		if (rbp->mr_hdl)
3928 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3929 		if (rbp->mr_desc)
3930 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3931 		kmem_free(rbp, sizeof (rib_bufpool_t));
3932 	}
3933 	return (NULL);
3934 }
3935 
3936 static void
3937 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3938 {
3939 	int i;
3940 	rib_bufpool_t *rbp = NULL;
3941 	bufpool_t *bp;
3942 
3943 	/*
3944 	 * Obtain pool address based on type of pool
3945 	 */
3946 	switch (ptype) {
3947 		case SEND_BUFFER:
3948 			rbp = hca->send_pool;
3949 			break;
3950 		case RECV_BUFFER:
3951 			rbp = hca->recv_pool;
3952 			break;
3953 		default:
3954 			return;
3955 	}
3956 	if (rbp == NULL)
3957 		return;
3958 
3959 	bp = rbp->bpool;
3960 
3961 	/*
3962 	 * Deregister the pool memory and free it.
3963 	 */
3964 	for (i = 0; i < bp->numelems; i++) {
3965 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3966 	}
3967 }
3968 
3969 static void
3970 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3971 {
3972 
3973 	rib_bufpool_t *rbp = NULL;
3974 	bufpool_t *bp;
3975 
3976 	/*
3977 	 * Obtain pool address based on type of pool
3978 	 */
3979 	switch (ptype) {
3980 		case SEND_BUFFER:
3981 			rbp = hca->send_pool;
3982 			break;
3983 		case RECV_BUFFER:
3984 			rbp = hca->recv_pool;
3985 			break;
3986 		default:
3987 			return;
3988 	}
3989 	if (rbp == NULL)
3990 		return;
3991 
3992 	bp = rbp->bpool;
3993 
3994 	/*
3995 	 * Free the pool memory.
3996 	 */
3997 	if (rbp->mr_hdl)
3998 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3999 
4000 	if (rbp->mr_desc)
4001 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4002 	if (bp->buf)
4003 		kmem_free(bp->buf, bp->bufsize);
4004 	mutex_destroy(&bp->buflock);
4005 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4006 	kmem_free(rbp, sizeof (rib_bufpool_t));
4007 }
4008 
4009 void
4010 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
4011 {
4012 	/*
4013 	 * Deregister the pool memory and free it.
4014 	 */
4015 	rib_rbufpool_deregister(hca, ptype);
4016 	rib_rbufpool_free(hca, ptype);
4017 }
4018 
4019 /*
4020  * Fetch a buffer from the pool of type specified in rdbuf->type.
4021  */
4022 static rdma_stat
4023 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4024 {
4025 	rib_lrc_entry_t *rlep;
4026 
4027 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
4028 		rlep = rib_get_cache_buf(conn, rdbuf->len);
4029 		rdbuf->rb_private =  (caddr_t)rlep;
4030 		rdbuf->addr = rlep->lrc_buf;
4031 		rdbuf->handle = rlep->lrc_mhandle;
4032 		return (RDMA_SUCCESS);
4033 	}
4034 
4035 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4036 	if (rdbuf->addr) {
4037 		switch (rdbuf->type) {
4038 		case SEND_BUFFER:
4039 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
4040 			break;
4041 		case RECV_BUFFER:
4042 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
4043 			break;
4044 		default:
4045 			rdbuf->len = 0;
4046 		}
4047 		return (RDMA_SUCCESS);
4048 	} else
4049 		return (RDMA_FAILED);
4050 }
4051 
4052 /*
4053  * Fetch a buffer of specified type.
4054  * Note that rdbuf->handle is mw's rkey.
4055  */
4056 static void *
4057 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4058 {
4059 	rib_qp_t	*qp = ctoqp(conn);
4060 	rib_hca_t	*hca = qp->hca;
4061 	rdma_btype	ptype = rdbuf->type;
4062 	void		*buf;
4063 	rib_bufpool_t	*rbp = NULL;
4064 	bufpool_t	*bp;
4065 	int		i;
4066 
4067 	/*
4068 	 * Obtain pool address based on type of pool
4069 	 */
4070 	switch (ptype) {
4071 	case SEND_BUFFER:
4072 		rbp = hca->send_pool;
4073 		break;
4074 	case RECV_BUFFER:
4075 		rbp = hca->recv_pool;
4076 		break;
4077 	default:
4078 		return (NULL);
4079 	}
4080 	if (rbp == NULL)
4081 		return (NULL);
4082 
4083 	bp = rbp->bpool;
4084 
4085 	mutex_enter(&bp->buflock);
4086 	if (bp->buffree < 0) {
4087 		mutex_exit(&bp->buflock);
4088 		return (NULL);
4089 	}
4090 
4091 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4092 	buf = bp->buflist[bp->buffree];
4093 	rdbuf->addr = buf;
4094 	rdbuf->len = bp->rsize;
4095 	for (i = bp->numelems - 1; i >= 0; i--) {
4096 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4097 			rdbuf->handle.mrc_rmr =
4098 			    (uint32_t)rbp->mr_desc[i].md_rkey;
4099 			rdbuf->handle.mrc_linfo =
4100 			    (uintptr_t)rbp->mr_hdl[i];
4101 			rdbuf->handle.mrc_lmr =
4102 			    (uint32_t)rbp->mr_desc[i].md_lkey;
4103 			bp->buffree--;
4104 
4105 			mutex_exit(&bp->buflock);
4106 
4107 			return (buf);
4108 		}
4109 	}
4110 
4111 	mutex_exit(&bp->buflock);
4112 
4113 	return (NULL);
4114 }
4115 
4116 static void
4117 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4118 {
4119 
4120 	if (rdbuf->type == RDMA_LONG_BUFFER) {
4121 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4122 		rdbuf->rb_private = NULL;
4123 		return;
4124 	}
4125 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4126 }
4127 
4128 static void
4129 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4130 {
4131 	rib_qp_t *qp = ctoqp(conn);
4132 	rib_hca_t *hca = qp->hca;
4133 	rib_bufpool_t *rbp = NULL;
4134 	bufpool_t *bp;
4135 
4136 	/*
4137 	 * Obtain pool address based on type of pool
4138 	 */
4139 	switch (ptype) {
4140 	case SEND_BUFFER:
4141 		rbp = hca->send_pool;
4142 		break;
4143 	case RECV_BUFFER:
4144 		rbp = hca->recv_pool;
4145 		break;
4146 	default:
4147 		return;
4148 	}
4149 	if (rbp == NULL)
4150 		return;
4151 
4152 	bp = rbp->bpool;
4153 
4154 	mutex_enter(&bp->buflock);
4155 	if (++bp->buffree >= bp->numelems) {
4156 		/*
4157 		 * Should never happen
4158 		 */
4159 		bp->buffree--;
4160 	} else {
4161 		bp->buflist[bp->buffree] = buf;
4162 	}
4163 	mutex_exit(&bp->buflock);
4164 }
4165 
4166 static rdma_stat
4167 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4168 {
4169 	rw_enter(&connlist->conn_lock, RW_WRITER);
4170 	if (connlist->conn_hd) {
4171 		cn->c_next = connlist->conn_hd;
4172 		connlist->conn_hd->c_prev = cn;
4173 	}
4174 	connlist->conn_hd = cn;
4175 	rw_exit(&connlist->conn_lock);
4176 
4177 	return (RDMA_SUCCESS);
4178 }
4179 
4180 static rdma_stat
4181 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4182 {
4183 	rw_enter(&connlist->conn_lock, RW_WRITER);
4184 	if (cn->c_prev) {
4185 		cn->c_prev->c_next = cn->c_next;
4186 	}
4187 	if (cn->c_next) {
4188 		cn->c_next->c_prev = cn->c_prev;
4189 	}
4190 	if (connlist->conn_hd == cn)
4191 		connlist->conn_hd = cn->c_next;
4192 	rw_exit(&connlist->conn_lock);
4193 
4194 	return (RDMA_SUCCESS);
4195 }
4196 
4197 /* ARGSUSED */
4198 static rdma_stat
4199 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4200     int addr_type, void *handle, CONN **conn)
4201 {
4202 	rdma_stat status;
4203 	rpcib_ping_t rpt;
4204 
4205 	status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4206 	return (status);
4207 }
4208 
4209 /*
4210  * rib_find_hca_connection
4211  *
4212  * if there is an existing connection to the specified address then
4213  * it will be returned in conn, otherwise conn will be set to NULL.
4214  * Also cleans up any connection that is in error state.
4215  */
4216 static int
4217 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
4218     struct netbuf *d_svcaddr, CONN **conn)
4219 {
4220 	CONN *cn;
4221 	clock_t cv_stat, timout;
4222 
4223 	*conn = NULL;
4224 again:
4225 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4226 	cn = hca->cl_conn_list.conn_hd;
4227 	while (cn != NULL) {
4228 		/*
4229 		 * First, clear up any connection in the ERROR state
4230 		 */
4231 		mutex_enter(&cn->c_lock);
4232 		if (cn->c_state == C_ERROR_CONN) {
4233 			if (cn->c_ref == 0) {
4234 				/*
4235 				 * Remove connection from list and destroy it.
4236 				 */
4237 				cn->c_state = C_DISCONN_PEND;
4238 				mutex_exit(&cn->c_lock);
4239 				rw_exit(&hca->cl_conn_list.conn_lock);
4240 				rib_conn_close((void *)cn);
4241 				goto again;
4242 			}
4243 			mutex_exit(&cn->c_lock);
4244 			cn = cn->c_next;
4245 			continue;
4246 		}
4247 		if (cn->c_state == C_DISCONN_PEND) {
4248 			mutex_exit(&cn->c_lock);
4249 			cn = cn->c_next;
4250 			continue;
4251 		}
4252 
4253 		/*
4254 		 * source address is only checked for if there is one,
4255 		 * this is the case for retries.
4256 		 */
4257 		if ((cn->c_raddr.len == d_svcaddr->len) &&
4258 		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
4259 		    d_svcaddr->len) == 0) &&
4260 		    ((s_svcaddr->len == 0) ||
4261 		    ((cn->c_laddr.len == s_svcaddr->len) &&
4262 		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
4263 		    s_svcaddr->len) == 0)))) {
4264 			/*
4265 			 * Our connection. Give up conn list lock
4266 			 * as we are done traversing the list.
4267 			 */
4268 			rw_exit(&hca->cl_conn_list.conn_lock);
4269 			if (cn->c_state == C_CONNECTED) {
4270 				cn->c_ref++;	/* sharing a conn */
4271 				mutex_exit(&cn->c_lock);
4272 				*conn = cn;
4273 				return (RDMA_SUCCESS);
4274 			}
4275 			if (cn->c_state == C_CONN_PEND) {
4276 				/*
4277 				 * Hold a reference to this conn before
4278 				 * we give up the lock.
4279 				 */
4280 				cn->c_ref++;
4281 				timout =  ddi_get_lbolt() +
4282 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
4283 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4284 				    &cn->c_lock, timout)) > 0 &&
4285 				    cn->c_state == C_CONN_PEND)
4286 					;
4287 				if (cv_stat == 0) {
4288 					(void) rib_conn_release_locked(cn);
4289 					return (RDMA_INTR);
4290 				}
4291 				if (cv_stat < 0) {
4292 					(void) rib_conn_release_locked(cn);
4293 					return (RDMA_TIMEDOUT);
4294 				}
4295 				if (cn->c_state == C_CONNECTED) {
4296 					*conn = cn;
4297 					mutex_exit(&cn->c_lock);
4298 					return (RDMA_SUCCESS);
4299 				} else {
4300 					(void) rib_conn_release_locked(cn);
4301 					return (RDMA_TIMEDOUT);
4302 				}
4303 			}
4304 		}
4305 		mutex_exit(&cn->c_lock);
4306 		cn = cn->c_next;
4307 	}
4308 	rw_exit(&hca->cl_conn_list.conn_lock);
4309 	*conn = NULL;
4310 	return (RDMA_FAILED);
4311 }
4312 
4313 /*
4314  * Connection management.
4315  * IBTF does not support recycling of channels. So connections are only
4316  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
4317  * C_DISCONN_PEND state. No C_IDLE state.
4318  * C_CONN_PEND state: Connection establishment in progress to the server.
4319  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4320  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4321  * only in this state.
4322  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4323  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4324  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4325  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4326  * c_ref drops to 0 (this indicates that RPC has no more references to this
4327  * connection), the connection should be destroyed. A connection transitions
4328  * into this state when it is being destroyed.
4329  */
4330 /* ARGSUSED */
4331 static rdma_stat
4332 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4333     int addr_type, rpcib_ping_t *rpt, CONN **conn)
4334 {
4335 	CONN *cn;
4336 	int status;
4337 	rib_hca_t *hca;
4338 	rib_qp_t *qp;
4339 	int s_addr_len;
4340 	char *s_addr_buf;
4341 
4342 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4343 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4344 		rw_enter(&hca->state_lock, RW_READER);
4345 		if (hca->state != HCA_DETACHED) {
4346 			status = rib_find_hca_connection(hca, s_svcaddr,
4347 			    d_svcaddr, conn);
4348 			rw_exit(&hca->state_lock);
4349 			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4350 				rw_exit(&rib_stat->hcas_list_lock);
4351 				return (status);
4352 			}
4353 		} else
4354 			rw_exit(&hca->state_lock);
4355 	}
4356 	rw_exit(&rib_stat->hcas_list_lock);
4357 
4358 	/*
4359 	 * No existing connection found, establish a new connection.
4360 	 */
4361 	bzero(rpt, sizeof (rpcib_ping_t));
4362 
4363 	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4364 	if (status != RDMA_SUCCESS) {
4365 		return (RDMA_FAILED);
4366 	}
4367 	hca = rpt->hca;
4368 
4369 	if (rpt->srcip.family == AF_INET) {
4370 		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4371 		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4372 	} else if (rpt->srcip.family == AF_INET6) {
4373 		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4374 		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4375 	} else {
4376 		return (RDMA_FAILED);
4377 	}
4378 
4379 	/*
4380 	 * Channel to server doesn't exist yet, create one.
4381 	 */
4382 	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4383 		return (RDMA_FAILED);
4384 	}
4385 	cn = qptoc(qp);
4386 	cn->c_state = C_CONN_PEND;
4387 	cn->c_ref = 1;
4388 
4389 	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4390 	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4391 	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4392 
4393 	if (rpt->srcip.family == AF_INET) {
4394 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4395 		(void) strcpy(cn->c_netid, RIBNETID_TCP);
4396 
4397 		cn->c_addrmask.len = cn->c_addrmask.maxlen =
4398 		    sizeof (struct sockaddr_in);
4399 		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4400 
4401 		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
4402 		    (uint32_t)~0;
4403 		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
4404 		    (ushort_t)~0;
4405 
4406 	} else {
4407 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4408 		(void) strcpy(cn->c_netid, RIBNETID_TCP6);
4409 
4410 		cn->c_addrmask.len = cn->c_addrmask.maxlen =
4411 		    sizeof (struct sockaddr_in6);
4412 		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);
4413 
4414 		(void) memset(
4415 		    &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
4416 		    (uchar_t)~0, sizeof (struct in6_addr));
4417 		((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
4418 		    (sa_family_t)~0;
4419 	}
4420 
4421 	/*
4422 	 * Add to conn list.
4423 	 * We had given up the READER lock. In the time since then,
4424 	 * another thread might have created the connection we are
4425 	 * trying here. But for now, that is quiet alright - there
4426 	 * might be two connections between a pair of hosts instead
4427 	 * of one. If we really want to close that window,
4428 	 * then need to check the list after acquiring the
4429 	 * WRITER lock.
4430 	 */
4431 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4432 	status = rib_conn_to_srv(hca, qp, rpt);
4433 	mutex_enter(&cn->c_lock);
4434 
4435 	if (cn->c_flags & C_CLOSE_PENDING) {
4436 		/*
4437 		 * This handles a case where the module or
4438 		 * HCA detached in the time a connection is
4439 		 * established. In such a case close the
4440 		 * connection immediately if this is the
4441 		 * only reference.
4442 		 */
4443 		if (cn->c_ref == 1) {
4444 			cn->c_ref--;
4445 			cn->c_state = C_DISCONN_PEND;
4446 			mutex_exit(&cn->c_lock);
4447 			rib_conn_close((void *)cn);
4448 			return (RDMA_FAILED);
4449 		}
4450 
4451 		/*
4452 		 * Connection to be closed later when c_ref = 0
4453 		 */
4454 		status = RDMA_FAILED;
4455 	}
4456 
4457 	if (status == RDMA_SUCCESS) {
4458 		cn->c_state = C_CONNECTED;
4459 		*conn = cn;
4460 	} else {
4461 		cn->c_state = C_ERROR_CONN;
4462 		cn->c_ref--;
4463 	}
4464 	cv_signal(&cn->c_cv);
4465 	mutex_exit(&cn->c_lock);
4466 	return (status);
4467 }
4468 
4469 static void
4470 rib_conn_close(void *rarg)
4471 {
4472 	CONN *conn = (CONN *)rarg;
4473 	rib_qp_t *qp = ctoqp(conn);
4474 
4475 	mutex_enter(&conn->c_lock);
4476 	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4477 
4478 		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4479 
4480 		/*
4481 		 * Live connection in CONNECTED state.
4482 		 */
4483 		if (conn->c_state == C_CONNECTED) {
4484 			conn->c_state = C_ERROR_CONN;
4485 		}
4486 		mutex_exit(&conn->c_lock);
4487 
4488 		rib_close_a_channel(conn);
4489 
4490 		mutex_enter(&conn->c_lock);
4491 		conn->c_flags &= ~C_CLOSE_PENDING;
4492 	}
4493 
4494 	mutex_exit(&conn->c_lock);
4495 
4496 	if (qp->mode == RIB_SERVER)
4497 		(void) rib_disconnect_channel(conn,
4498 		    &qp->hca->srv_conn_list);
4499 	else
4500 		(void) rib_disconnect_channel(conn,
4501 		    &qp->hca->cl_conn_list);
4502 }
4503 
4504 static void
4505 rib_conn_timeout_call(void *carg)
4506 {
4507 	time_t idle_time;
4508 	CONN *conn = (CONN *)carg;
4509 	rib_hca_t *hca = ctoqp(conn)->hca;
4510 	int error;
4511 
4512 	mutex_enter(&conn->c_lock);
4513 	if ((conn->c_ref > 0) ||
4514 	    (conn->c_state == C_DISCONN_PEND)) {
4515 		conn->c_timeout = NULL;
4516 		mutex_exit(&conn->c_lock);
4517 		return;
4518 	}
4519 
4520 	idle_time = (gethrestime_sec() - conn->c_last_used);
4521 
4522 	if ((idle_time <= rib_conn_timeout) &&
4523 	    (conn->c_state != C_ERROR_CONN)) {
4524 		/*
4525 		 * There was activity after the last timeout.
4526 		 * Extend the conn life. Unless the conn is
4527 		 * already in error state.
4528 		 */
4529 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4530 		    SEC_TO_TICK(rib_conn_timeout - idle_time));
4531 		mutex_exit(&conn->c_lock);
4532 		return;
4533 	}
4534 
4535 	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4536 	    (void *)conn, DDI_NOSLEEP);
4537 
4538 	/*
4539 	 * If taskq dispatch fails above, then reset the timeout
4540 	 * to try again after 10 secs.
4541 	 */
4542 
4543 	if (error != DDI_SUCCESS) {
4544 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4545 		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4546 		mutex_exit(&conn->c_lock);
4547 		return;
4548 	}
4549 
4550 	conn->c_state = C_DISCONN_PEND;
4551 	mutex_exit(&conn->c_lock);
4552 }
4553 
4554 static rdma_stat
4555 rib_conn_release(CONN *conn)
4556 {
4557 	mutex_enter(&conn->c_lock);
4558 	return (rib_conn_release_locked(conn));
4559 }
4560 
4561 /*
4562  * Expects conn->c_lock to be held on entry.
4563  * c_lock released on return
4564  */
4565 static rdma_stat
4566 rib_conn_release_locked(CONN *conn)
4567 {
4568 	conn->c_ref--;
4569 
4570 	conn->c_last_used = gethrestime_sec();
4571 	if (conn->c_ref > 0) {
4572 		mutex_exit(&conn->c_lock);
4573 		return (RDMA_SUCCESS);
4574 	}
4575 
4576 	/*
4577 	 * If a conn is C_ERROR_CONN, close the channel.
4578 	 */
4579 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4580 		conn->c_state = C_DISCONN_PEND;
4581 		mutex_exit(&conn->c_lock);
4582 		rib_conn_close((void *)conn);
4583 		return (RDMA_SUCCESS);
4584 	}
4585 
4586 	/*
4587 	 * c_ref == 0, set a timeout for conn release
4588 	 */
4589 
4590 	if (conn->c_timeout == NULL) {
4591 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4592 		    SEC_TO_TICK(rib_conn_timeout));
4593 	}
4594 
4595 	mutex_exit(&conn->c_lock);
4596 	return (RDMA_SUCCESS);
4597 }
4598 
4599 /*
4600  * Add at front of list
4601  */
4602 static struct rdma_done_list *
4603 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4604 {
4605 	struct rdma_done_list *rd;
4606 
4607 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4608 
4609 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4610 	rd->xid = xid;
4611 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4612 
4613 	rd->prev = NULL;
4614 	rd->next = qp->rdlist;
4615 	if (qp->rdlist != NULL)
4616 		qp->rdlist->prev = rd;
4617 	qp->rdlist = rd;
4618 
4619 	return (rd);
4620 }
4621 
4622 static void
4623 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4624 {
4625 	struct rdma_done_list *r;
4626 
4627 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4628 
4629 	r = rd->next;
4630 	if (r != NULL) {
4631 		r->prev = rd->prev;
4632 	}
4633 
4634 	r = rd->prev;
4635 	if (r != NULL) {
4636 		r->next = rd->next;
4637 	} else {
4638 		qp->rdlist = rd->next;
4639 	}
4640 
4641 	cv_destroy(&rd->rdma_done_cv);
4642 	kmem_free(rd, sizeof (*rd));
4643 }
4644 
4645 static void
4646 rdma_done_rem_list(rib_qp_t *qp)
4647 {
4648 	struct rdma_done_list	*r, *n;
4649 
4650 	mutex_enter(&qp->rdlist_lock);
4651 	for (r = qp->rdlist; r != NULL; r = n) {
4652 		n = r->next;
4653 		rdma_done_rm(qp, r);
4654 	}
4655 	mutex_exit(&qp->rdlist_lock);
4656 }
4657 
4658 static void
4659 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4660 {
4661 	struct rdma_done_list *r = qp->rdlist;
4662 
4663 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4664 
4665 	while (r) {
4666 		if (r->xid == xid) {
4667 			cv_signal(&r->rdma_done_cv);
4668 			return;
4669 		} else {
4670 			r = r->next;
4671 		}
4672 	}
4673 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4674 	    int, xid);
4675 }
4676 
4677 /*
4678  * Expects conn->c_lock to be held by the caller.
4679  */
4680 
4681 static void
4682 rib_close_a_channel(CONN *conn)
4683 {
4684 	rib_qp_t	*qp;
4685 	qp = ctoqp(conn);
4686 
4687 	if (qp->qp_hdl == NULL) {
4688 		/* channel already freed */
4689 		return;
4690 	}
4691 
4692 	/*
4693 	 * Call ibt_close_rc_channel in blocking mode
4694 	 * with no callbacks.
4695 	 */
4696 	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4697 	    NULL, 0, NULL, NULL, 0);
4698 }
4699 
4700 /*
4701  * Goes through all connections and closes the channel
4702  * This will cause all the WRs on those channels to be
4703  * flushed.
4704  */
4705 static void
4706 rib_close_channels(rib_conn_list_t *connlist)
4707 {
4708 	CONN 		*conn, *tmp;
4709 
4710 	rw_enter(&connlist->conn_lock, RW_READER);
4711 	conn = connlist->conn_hd;
4712 	while (conn != NULL) {
4713 		mutex_enter(&conn->c_lock);
4714 		tmp = conn->c_next;
4715 		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4716 
4717 			if (conn->c_state == C_CONN_PEND) {
4718 				conn->c_flags |= C_CLOSE_PENDING;
4719 				goto next;
4720 			}
4721 
4722 			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4723 
4724 			/*
4725 			 * Live connection in CONNECTED state.
4726 			 */
4727 			if (conn->c_state == C_CONNECTED)
4728 				conn->c_state = C_ERROR_CONN;
4729 			mutex_exit(&conn->c_lock);
4730 
4731 			rib_close_a_channel(conn);
4732 
4733 			mutex_enter(&conn->c_lock);
4734 			conn->c_flags &= ~C_CLOSE_PENDING;
4735 			/* Signal a pending rib_disconnect_channel() */
4736 			cv_signal(&conn->c_cv);
4737 		}
4738 next:
4739 		mutex_exit(&conn->c_lock);
4740 		conn = tmp;
4741 	}
4742 	rw_exit(&connlist->conn_lock);
4743 }
4744 
4745 /*
4746  * Frees up all connections that are no longer being referenced
4747  */
4748 static void
4749 rib_purge_connlist(rib_conn_list_t *connlist)
4750 {
4751 	CONN 		*conn;
4752 
4753 top:
4754 	rw_enter(&connlist->conn_lock, RW_READER);
4755 	conn = connlist->conn_hd;
4756 	while (conn != NULL) {
4757 		mutex_enter(&conn->c_lock);
4758 
4759 		/*
4760 		 * At this point connection is either in ERROR
4761 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4762 		 * then some other thread is culling that connection.
4763 		 * If not and if c_ref is 0, then destroy the connection.
4764 		 */
4765 		if (conn->c_ref == 0 &&
4766 		    conn->c_state != C_DISCONN_PEND) {
4767 			/*
4768 			 * Cull the connection
4769 			 */
4770 			conn->c_state = C_DISCONN_PEND;
4771 			mutex_exit(&conn->c_lock);
4772 			rw_exit(&connlist->conn_lock);
4773 			(void) rib_disconnect_channel(conn, connlist);
4774 			goto top;
4775 		} else {
4776 			/*
4777 			 * conn disconnect already scheduled or will
4778 			 * happen from conn_release when c_ref drops to 0.
4779 			 */
4780 			mutex_exit(&conn->c_lock);
4781 		}
4782 		conn = conn->c_next;
4783 	}
4784 	rw_exit(&connlist->conn_lock);
4785 
4786 	/*
4787 	 * At this point, only connections with c_ref != 0 are on the list
4788 	 */
4789 }
4790 
4791 /*
4792  * Free all the HCA resources and close
4793  * the hca.
4794  */
4795 
4796 static void
4797 rib_free_hca(rib_hca_t *hca)
4798 {
4799 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4800 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4801 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4802 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4803 
4804 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4805 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4806 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4807 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4808 
4809 	rib_rbufpool_destroy(hca, RECV_BUFFER);
4810 	rib_rbufpool_destroy(hca, SEND_BUFFER);
4811 	rib_destroy_cache(hca);
4812 	if (rib_mod.rdma_count == 0)
4813 		(void) rdma_unregister_mod(&rib_mod);
4814 	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4815 	(void) ibt_close_hca(hca->hca_hdl);
4816 	hca->hca_hdl = NULL;
4817 }
4818 
4819 
4820 static void
4821 rib_stop_hca_services(rib_hca_t *hca)
4822 {
4823 	rib_stop_services(hca);
4824 	rib_close_channels(&hca->cl_conn_list);
4825 	rib_close_channels(&hca->srv_conn_list);
4826 
4827 	rib_purge_connlist(&hca->cl_conn_list);
4828 	rib_purge_connlist(&hca->srv_conn_list);
4829 
4830 	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4831 		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4832 		    GLOBAL_ZONEID);
4833 		stats_enabled = FALSE;
4834 	}
4835 
4836 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4837 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4838 	if (hca->srv_conn_list.conn_hd == NULL &&
4839 	    hca->cl_conn_list.conn_hd == NULL) {
4840 		/*
4841 		 * conn_lists are NULL, so destroy
4842 		 * buffers, close hca and be done.
4843 		 */
4844 		rib_free_hca(hca);
4845 	}
4846 	rw_exit(&hca->cl_conn_list.conn_lock);
4847 	rw_exit(&hca->srv_conn_list.conn_lock);
4848 
4849 	if (hca->hca_hdl != NULL) {
4850 		mutex_enter(&hca->inuse_lock);
4851 		while (hca->inuse)
4852 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4853 		mutex_exit(&hca->inuse_lock);
4854 
4855 		rib_free_hca(hca);
4856 	}
4857 	rw_destroy(&hca->bound_services_lock);
4858 
4859 	if (hca->cleanup_helper != NULL) {
4860 		ddi_taskq_destroy(hca->cleanup_helper);
4861 		hca->cleanup_helper = NULL;
4862 	}
4863 }
4864 
4865 /*
4866  * Cleans and closes up all uses of the HCA
4867  */
4868 static void
4869 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
4870 {
4871 	rib_hca_t *hca = NULL;
4872 	rib_hca_t **hcap;
4873 
4874 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4875 	for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
4876 		hca = *hcap;
4877 		rw_enter(&hca->state_lock, RW_WRITER);
4878 		if (hca->hca_hdl == hca_hdl) {
4879 			/*
4880 			 * Mark as detached and remove from
4881 			 * hca list.
4882 			 */
4883 			hca->state = HCA_DETACHED;
4884 			*hcap = hca->next;
4885 			rib_stat->nhca_inited--;
4886 			rib_mod.rdma_count--;
4887 			rw_exit(&hca->state_lock);
4888 			break;
4889 		}
4890 		rw_exit(&hca->state_lock);
4891 	}
4892 	rw_exit(&rib_stat->hcas_list_lock);
4893 
4894 	if (hca == NULL)
4895 		return;
4896 	ASSERT(hca->hca_hdl == hca_hdl);
4897 
4898 	/*
4899 	 * Stop all services on the HCA
4900 	 * Go through cl_conn_list and close all rc_channels
4901 	 * Go through svr_conn_list and close all rc_channels
4902 	 * Free connections whose c_ref has dropped to 0
4903 	 * Destroy all CQs
4904 	 * Deregister and released all buffer pool memory after all
4905 	 * connections are destroyed
4906 	 * Free the protection domain
4907 	 * ibt_close_hca()
4908 	 */
4909 	rib_stop_hca_services(hca);
4910 
4911 	kmem_free(hca, sizeof (*hca));
4912 }
4913 
4914 static void
4915 rib_server_side_cache_reclaim(void *argp)
4916 {
4917 	cache_avl_struct_t    *rcas;
4918 	rib_lrc_entry_t		*rb;
4919 	rib_hca_t *hca = (rib_hca_t *)argp;
4920 
4921 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4922 	rcas = avl_first(&hca->avl_tree);
4923 	if (rcas != NULL)
4924 		avl_remove(&hca->avl_tree, rcas);
4925 
4926 	while (rcas != NULL) {
4927 		while (rcas->r.forw != &rcas->r) {
4928 			rcas->elements--;
4929 			rb = rcas->r.forw;
4930 			remque(rb);
4931 			if (rb->registered)
4932 				(void) rib_deregistermem_via_hca(hca,
4933 				    rb->lrc_buf, rb->lrc_mhandle);
4934 
4935 			hca->cache_allocation -= rb->lrc_len;
4936 			kmem_free(rb->lrc_buf, rb->lrc_len);
4937 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4938 		}
4939 		mutex_destroy(&rcas->node_lock);
4940 		kmem_cache_free(hca->server_side_cache, rcas);
4941 		rcas = avl_first(&hca->avl_tree);
4942 		if (rcas != NULL)
4943 			avl_remove(&hca->avl_tree, rcas);
4944 	}
4945 	rw_exit(&hca->avl_rw_lock);
4946 }
4947 
4948 static void
4949 rib_server_side_cache_cleanup(void *argp)
4950 {
4951 	cache_avl_struct_t    *rcas;
4952 	rib_lrc_entry_t		*rb;
4953 	rib_hca_t *hca = (rib_hca_t *)argp;
4954 
4955 	mutex_enter(&hca->cache_allocation_lock);
4956 	if (hca->cache_allocation < cache_limit) {
4957 		mutex_exit(&hca->cache_allocation_lock);
4958 		return;
4959 	}
4960 	mutex_exit(&hca->cache_allocation_lock);
4961 
4962 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4963 	rcas = avl_last(&hca->avl_tree);
4964 	if (rcas != NULL)
4965 		avl_remove(&hca->avl_tree, rcas);
4966 
4967 	while (rcas != NULL) {
4968 		while (rcas->r.forw != &rcas->r) {
4969 			rcas->elements--;
4970 			rb = rcas->r.forw;
4971 			remque(rb);
4972 			if (rb->registered)
4973 				(void) rib_deregistermem_via_hca(hca,
4974 				    rb->lrc_buf, rb->lrc_mhandle);
4975 
4976 			hca->cache_allocation -= rb->lrc_len;
4977 
4978 			kmem_free(rb->lrc_buf, rb->lrc_len);
4979 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4980 		}
4981 		mutex_destroy(&rcas->node_lock);
4982 		if (hca->server_side_cache) {
4983 			kmem_cache_free(hca->server_side_cache, rcas);
4984 		}
4985 
4986 		if (hca->cache_allocation < cache_limit) {
4987 			rw_exit(&hca->avl_rw_lock);
4988 			return;
4989 		}
4990 
4991 		rcas = avl_last(&hca->avl_tree);
4992 		if (rcas != NULL)
4993 			avl_remove(&hca->avl_tree, rcas);
4994 	}
4995 	rw_exit(&hca->avl_rw_lock);
4996 }
4997 
4998 static int
4999 avl_compare(const void *t1, const void *t2)
5000 {
5001 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
5002 		return (0);
5003 
5004 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
5005 		return (-1);
5006 
5007 	return (1);
5008 }
5009 
5010 static void
5011 rib_destroy_cache(rib_hca_t *hca)
5012 {
5013 	if (hca->avl_init) {
5014 		rib_server_side_cache_reclaim((void *)hca);
5015 		if (hca->server_side_cache) {
5016 			kmem_cache_destroy(hca->server_side_cache);
5017 			hca->server_side_cache = NULL;
5018 		}
5019 		avl_destroy(&hca->avl_tree);
5020 		mutex_destroy(&hca->cache_allocation_lock);
5021 		rw_destroy(&hca->avl_rw_lock);
5022 	}
5023 	hca->avl_init = FALSE;
5024 }
5025 
5026 static void
5027 rib_force_cleanup(void *hca)
5028 {
5029 	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
5030 		(void) ddi_taskq_dispatch(
5031 		    ((rib_hca_t *)hca)->cleanup_helper,
5032 		    rib_server_side_cache_cleanup,
5033 		    (void *)hca, DDI_NOSLEEP);
5034 }
5035 
5036 static rib_lrc_entry_t *
5037 rib_get_cache_buf(CONN *conn, uint32_t len)
5038 {
5039 	cache_avl_struct_t	cas, *rcas;
5040 	rib_hca_t	*hca = (ctoqp(conn))->hca;
5041 	rib_lrc_entry_t *reply_buf;
5042 	avl_index_t where = NULL;
5043 	uint64_t c_alloc = 0;
5044 
5045 	if (!hca->avl_init)
5046 		goto  error_alloc;
5047 
5048 	cas.len = len;
5049 
5050 	rw_enter(&hca->avl_rw_lock, RW_READER);
5051 
5052 	mutex_enter(&hca->cache_allocation_lock);
5053 	c_alloc = hca->cache_allocation;
5054 	mutex_exit(&hca->cache_allocation_lock);
5055 
5056 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5057 	    &where)) == NULL) {
5058 		/* Am I above the cache limit */
5059 		if ((c_alloc + len) >= cache_limit) {
5060 			rib_force_cleanup((void *)hca);
5061 			rw_exit(&hca->avl_rw_lock);
5062 			mutex_enter(&hca->cache_allocation_lock);
5063 			hca->cache_misses_above_the_limit ++;
5064 			mutex_exit(&hca->cache_allocation_lock);
5065 
5066 			/* Allocate and register the buffer directly */
5067 			goto error_alloc;
5068 		}
5069 
5070 		rw_exit(&hca->avl_rw_lock);
5071 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
5072 
5073 		/* Recheck to make sure no other thread added the entry in */
5074 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5075 		    &cas, &where)) == NULL) {
5076 			/* Allocate an avl tree entry */
5077 			rcas = (cache_avl_struct_t *)
5078 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5079 
5080 			bzero(rcas, sizeof (cache_avl_struct_t));
5081 			rcas->elements = 0;
5082 			rcas->r.forw = &rcas->r;
5083 			rcas->r.back = &rcas->r;
5084 			rcas->len = len;
5085 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5086 			avl_insert(&hca->avl_tree, rcas, where);
5087 		}
5088 	}
5089 
5090 	mutex_enter(&rcas->node_lock);
5091 
5092 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5093 		reply_buf = rcas->r.forw;
5094 		remque(reply_buf);
5095 		rcas->elements--;
5096 		mutex_exit(&rcas->node_lock);
5097 		rw_exit(&hca->avl_rw_lock);
5098 
5099 		mutex_enter(&hca->cache_allocation_lock);
5100 		hca->cache_hits++;
5101 		hca->cache_allocation -= len;
5102 		mutex_exit(&hca->cache_allocation_lock);
5103 	} else {
5104 		/* Am I above the cache limit */
5105 		mutex_exit(&rcas->node_lock);
5106 		if ((c_alloc + len) >= cache_limit) {
5107 			rib_force_cleanup((void *)hca);
5108 			rw_exit(&hca->avl_rw_lock);
5109 
5110 			mutex_enter(&hca->cache_allocation_lock);
5111 			hca->cache_misses_above_the_limit++;
5112 			mutex_exit(&hca->cache_allocation_lock);
5113 			/* Allocate and register the buffer directly */
5114 			goto error_alloc;
5115 		}
5116 		rw_exit(&hca->avl_rw_lock);
5117 		mutex_enter(&hca->cache_allocation_lock);
5118 		hca->cache_misses++;
5119 		mutex_exit(&hca->cache_allocation_lock);
5120 		/* Allocate a reply_buf entry */
5121 		reply_buf = (rib_lrc_entry_t *)
5122 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5123 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
5124 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
5125 		reply_buf->lrc_len  = len;
5126 		reply_buf->registered = FALSE;
5127 		reply_buf->avl_node = (void *)rcas;
5128 	}
5129 
5130 	return (reply_buf);
5131 
5132 error_alloc:
5133 	reply_buf = (rib_lrc_entry_t *)
5134 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5135 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
5136 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5137 	reply_buf->lrc_len = len;
5138 	reply_buf->registered = FALSE;
5139 	reply_buf->avl_node = NULL;
5140 
5141 	return (reply_buf);
5142 }
5143 
5144 /*
5145  * Return a pre-registered back to the cache (without
5146  * unregistering the buffer)..
5147  */
5148 
5149 static void
5150 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5151 {
5152 	cache_avl_struct_t    cas, *rcas;
5153 	avl_index_t where = NULL;
5154 	rib_hca_t	*hca = (ctoqp(conn))->hca;
5155 
5156 	if (!hca->avl_init)
5157 		goto  error_free;
5158 
5159 	cas.len = reg_buf->lrc_len;
5160 	rw_enter(&hca->avl_rw_lock, RW_READER);
5161 	if ((rcas = (cache_avl_struct_t *)
5162 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5163 		rw_exit(&hca->avl_rw_lock);
5164 		goto error_free;
5165 	} else {
5166 		cas.len = reg_buf->lrc_len;
5167 		mutex_enter(&rcas->node_lock);
5168 		insque(reg_buf, &rcas->r);
5169 		rcas->elements ++;
5170 		mutex_exit(&rcas->node_lock);
5171 		rw_exit(&hca->avl_rw_lock);
5172 		mutex_enter(&hca->cache_allocation_lock);
5173 		hca->cache_allocation += cas.len;
5174 		mutex_exit(&hca->cache_allocation_lock);
5175 	}
5176 
5177 	return;
5178 
5179 error_free:
5180 
5181 	if (reg_buf->registered)
5182 		(void) rib_deregistermem_via_hca(hca,
5183 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5184 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5185 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5186 }
5187 
5188 static rdma_stat
5189 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5190 	uint_t buflen, struct mrc *buf_handle)
5191 {
5192 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
5193 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
5194 	rdma_stat	status;
5195 
5196 
5197 	/*
5198 	 * Note: ALL buffer pools use the same memory type RDMARW.
5199 	 */
5200 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5201 	if (status == RDMA_SUCCESS) {
5202 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5203 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5204 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5205 	} else {
5206 		buf_handle->mrc_linfo = NULL;
5207 		buf_handle->mrc_lmr = 0;
5208 		buf_handle->mrc_rmr = 0;
5209 	}
5210 	return (status);
5211 }
5212 
5213 /* ARGSUSED */
5214 static rdma_stat
5215 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5216     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5217 {
5218 
5219 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5220 	return (RDMA_SUCCESS);
5221 }
5222 
5223 /* ARGSUSED */
5224 static rdma_stat
5225 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5226 {
5227 
5228 	(void) ibt_deregister_mr(hca->hca_hdl,
5229 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5230 	return (RDMA_SUCCESS);
5231 }
5232 
5233 /*
5234  * Check if the IP interface named by `lifrp' is RDMA-capable.
5235  */
5236 static boolean_t
5237 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5238 {
5239 	char ifname[LIFNAMSIZ];
5240 	char *cp;
5241 
5242 	if (lifrp->lifr_type == IFT_IB)
5243 		return (B_TRUE);
5244 
5245 	/*
5246 	 * Strip off the logical interface portion before getting
5247 	 * intimate with the name.
5248 	 */
5249 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5250 	if ((cp = strchr(ifname, ':')) != NULL)
5251 		*cp = '\0';
5252 
5253 	return (strcmp("lo0", ifname) == 0);
5254 }
5255 
5256 static int
5257 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5258 {
5259 	vnode_t *kkvp, *vp;
5260 	TIUSER  *tiptr;
5261 	struct  strioctl iocb;
5262 	k_sigset_t smask;
5263 	int	err = 0;
5264 
5265 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
5266 		if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
5267 		    &tiptr, CRED()) == 0) {
5268 			vp = tiptr->fp->f_vnode;
5269 		} else {
5270 			VN_RELE(kkvp);
5271 			return (EPROTO);
5272 		}
5273 	} else {
5274 		return (EPROTO);
5275 	}
5276 
5277 	iocb.ic_cmd = cmd;
5278 	iocb.ic_timout = 0;
5279 	iocb.ic_len = len;
5280 	iocb.ic_dp = (caddr_t)arg;
5281 	sigintr(&smask, 0);
5282 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5283 	sigunintr(&smask);
5284 	(void) t_kclose(tiptr, 0);
5285 	VN_RELE(kkvp);
5286 	return (err);
5287 }
5288 
5289 /*
5290  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5291  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5292  */
5293 static int
5294 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5295 {
5296 	int err;
5297 	struct lifnum lifn;
5298 
5299 	bzero(&lifn, sizeof (struct lifnum));
5300 	lifn.lifn_family = AF_UNSPEC;
5301 
5302 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5303 	if (err != 0)
5304 		return (err);
5305 
5306 	/*
5307 	 * Pad the interface count to account for additional interfaces that
5308 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5309 	 */
5310 	lifn.lifn_count += 4;
5311 
5312 	bzero(lifcp, sizeof (struct lifconf));
5313 	lifcp->lifc_family = AF_UNSPEC;
5314 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5315 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5316 
5317 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5318 	if (err != 0) {
5319 		kmem_free(lifcp->lifc_buf, *bufsizep);
5320 		return (err);
5321 	}
5322 	return (0);
5323 }
5324 
5325 static boolean_t
5326 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5327 {
5328 	uint_t i, nifs;
5329 	uint_t bufsize;
5330 	struct lifconf lifc;
5331 	struct lifreq *lifrp;
5332 	struct sockaddr_in *sinp;
5333 	struct sockaddr_in6 *sin6p;
5334 
5335 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5336 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5337 
5338 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5339 		return (B_FALSE);
5340 
5341 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5342 		kmem_free(lifc.lifc_buf, bufsize);
5343 		return (B_FALSE);
5344 	}
5345 
5346 	/*
5347 	 * Worst case is that all of the addresses are IB-capable and have
5348 	 * the same address family, so size our buffers accordingly.
5349 	 */
5350 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5351 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5352 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5353 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5354 
5355 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5356 		if (!rpcib_rdma_capable_interface(lifrp))
5357 			continue;
5358 
5359 		if (lifrp->lifr_addr.ss_family == AF_INET) {
5360 			sinp = addrs4->ri_list;
5361 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5362 			    sizeof (struct sockaddr_in));
5363 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5364 			sin6p = addrs6->ri_list;
5365 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5366 			    sizeof (struct sockaddr_in6));
5367 		}
5368 	}
5369 
5370 	kmem_free(lifc.lifc_buf, bufsize);
5371 	return (B_TRUE);
5372 }
5373 
5374 /* ARGSUSED */
5375 static int
5376 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5377 {
5378 	rib_hca_t *hca;
5379 
5380 	if (KSTAT_WRITE == rw) {
5381 		return (EACCES);
5382 	}
5383 
5384 	rpcib_kstat.cache_limit.value.ui64 =
5385 	    (uint64_t)cache_limit;
5386 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5387 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5388 		rpcib_kstat.cache_allocation.value.ui64 +=
5389 		    (uint64_t)hca->cache_allocation;
5390 		rpcib_kstat.cache_hits.value.ui64 +=
5391 		    (uint64_t)hca->cache_hits;
5392 		rpcib_kstat.cache_misses.value.ui64 +=
5393 		    (uint64_t)hca->cache_misses;
5394 		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5395 		    (uint64_t)hca->cache_misses_above_the_limit;
5396 	}
5397 	rw_exit(&rib_stat->hcas_list_lock);
5398 	return (0);
5399 }
5400