xref: /titanic_50/usr/src/uts/common/rpc/rpcib.c (revision 707da956f10b527c61331142582204e292b21bd6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developors:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * The rpcib plugin. Implements the interface for RDMATF's
42  * interaction with IBTF.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84 
85 #define	NFS_RDMA_PORT	20049
86 
87 
88 /*
89  * Convenience structures for connection management
90  */
91 typedef struct rpcib_ipaddrs {
92 	void	*ri_list;	/* pointer to list of addresses */
93 	uint_t	ri_count;	/* number of addresses in list */
94 	uint_t	ri_size;	/* size of ri_list in bytes */
95 } rpcib_ipaddrs_t;
96 
97 
98 typedef struct rpcib_ping {
99 	rib_hca_t  *hca;
100 	ibt_path_info_t path;
101 	ibt_ip_addr_t srcip;
102 	ibt_ip_addr_t dstip;
103 } rpcib_ping_t;
104 
105 /*
106  * Prototype declarations for driver ops
107  */
108 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 				void *, void **);
111 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 static int	rpcib_do_ip_ioctl(int, int, void *);
114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 static int rpcib_cache_kstat_update(kstat_t *, int);
116 static void rib_force_cleanup(void *);
117 static void rib_stop_hca_services(rib_hca_t *);
118 static void rib_attach_hca(void);
119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
120 		struct netbuf *d_svcaddr, CONN **conn);
121 
122 struct {
123 	kstat_named_t cache_limit;
124 	kstat_named_t cache_allocation;
125 	kstat_named_t cache_hits;
126 	kstat_named_t cache_misses;
127 	kstat_named_t cache_misses_above_the_limit;
128 } rpcib_kstat = {
129 	{"cache_limit",			KSTAT_DATA_UINT64 },
130 	{"cache_allocation",		KSTAT_DATA_UINT64 },
131 	{"cache_hits",			KSTAT_DATA_UINT64 },
132 	{"cache_misses",		KSTAT_DATA_UINT64 },
133 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
134 };
135 
136 /* rpcib cb_ops */
137 static struct cb_ops rpcib_cbops = {
138 	nulldev,		/* open */
139 	nulldev,		/* close */
140 	nodev,			/* strategy */
141 	nodev,			/* print */
142 	nodev,			/* dump */
143 	nodev,			/* read */
144 	nodev,			/* write */
145 	nodev,			/* ioctl */
146 	nodev,			/* devmap */
147 	nodev,			/* mmap */
148 	nodev,			/* segmap */
149 	nochpoll,		/* poll */
150 	ddi_prop_op,		/* prop_op */
151 	NULL,			/* stream */
152 	D_MP,			/* cb_flag */
153 	CB_REV,			/* rev */
154 	nodev,			/* int (*cb_aread)() */
155 	nodev			/* int (*cb_awrite)() */
156 };
157 
158 /*
159  * Device options
160  */
161 static struct dev_ops rpcib_ops = {
162 	DEVO_REV,		/* devo_rev, */
163 	0,			/* refcnt  */
164 	rpcib_getinfo,		/* info */
165 	nulldev,		/* identify */
166 	nulldev,		/* probe */
167 	rpcib_attach,		/* attach */
168 	rpcib_detach,		/* detach */
169 	nodev,			/* reset */
170 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
171 	NULL,			/* bus operations */
172 	NULL,			/* power */
173 	ddi_quiesce_not_needed,		/* quiesce */
174 };
175 
176 /*
177  * Module linkage information.
178  */
179 
180 static struct modldrv rib_modldrv = {
181 	&mod_driverops,		/* Driver module */
182 	"RPCIB plugin driver",	/* Driver name and version */
183 	&rpcib_ops,		/* Driver ops */
184 };
185 
186 static struct modlinkage rib_modlinkage = {
187 	MODREV_1,
188 	(void *)&rib_modldrv,
189 	NULL
190 };
191 
192 typedef struct rib_lrc_entry {
193 	struct rib_lrc_entry *forw;
194 	struct rib_lrc_entry *back;
195 	char *lrc_buf;
196 
197 	uint32_t lrc_len;
198 	void  *avl_node;
199 	bool_t registered;
200 
201 	struct mrc lrc_mhandle;
202 	bool_t lrc_on_freed_list;
203 } rib_lrc_entry_t;
204 
205 typedef	struct cache_struct	{
206 	rib_lrc_entry_t		r;
207 	uint32_t		len;
208 	uint32_t		elements;
209 	kmutex_t		node_lock;
210 	avl_node_t		avl_link;
211 } cache_avl_struct_t;
212 
213 uint64_t	cache_limit = 100 * 1024 * 1024;
214 static uint64_t	cache_watermark = 80 * 1024 * 1024;
215 static bool_t	stats_enabled = FALSE;
216 
217 static uint64_t max_unsignaled_rws = 5;
218 int nfs_rdma_port = NFS_RDMA_PORT;
219 
220 #define	RIBNETID_TCP	"tcp"
221 #define	RIBNETID_TCP6	"tcp6"
222 
223 /*
224  * rib_stat: private data pointer used when registering
225  *	with the IBTF.  It is returned to the consumer
226  *	in all callbacks.
227  */
228 static rpcib_state_t *rib_stat = NULL;
229 
230 #define	RNR_RETRIES	IBT_RNR_RETRY_1
231 #define	MAX_PORTS	2
232 #define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
233 #define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */
234 
235 int preposted_rbufs = RDMA_BUFS_GRANT;
236 int send_threshold = 1;
237 
238 /*
239  * Old cards with Tavor driver have limited memory footprint
240  * when booted in 32bit. The rib_max_rbufs tunable can be
241  * tuned for more buffers if needed.
242  */
243 
244 #if !defined(_ELF64) && !defined(__sparc)
245 int rib_max_rbufs = MAX_BUFS;
246 #else
247 int rib_max_rbufs = 10 * MAX_BUFS;
248 #endif	/* !(_ELF64) && !(__sparc) */
249 
250 int rib_conn_timeout = 60 * 12;		/* 12 minutes */
251 
252 /*
253  * State of the plugin.
254  * ACCEPT = accepting new connections and requests.
255  * NO_ACCEPT = not accepting new connection and requests.
256  * This should eventually move to rpcib_state_t structure, since this
257  * will tell in which state the plugin is for a particular type of service
258  * like NFS, NLM or v4 Callback deamon. The plugin might be in accept
259  * state for one and in no_accept state for the other.
260  */
261 int		plugin_state;
262 kmutex_t	plugin_state_lock;
263 
264 ldi_ident_t rpcib_li;
265 
266 /*
267  * RPCIB RDMATF operations
268  */
269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
270 static rdma_stat rib_disconnect(CONN *conn);
271 static void rib_listen(struct rdma_svc_data *rd);
272 static void rib_listen_stop(struct rdma_svc_data *rd);
273 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
274 	uint_t buflen, struct mrc *buf_handle);
275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
276 	struct mrc buf_handle);
277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
278 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
280 		struct mrc buf_handle);
281 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
282 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
283 	void *lrc);
284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
285 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
287 	caddr_t buf, int len, int cpu);
288 
289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
290 
291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
293 
294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
295 
296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
305 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
306 	int addr_type, void *, CONN **);
307 static rdma_stat rib_conn_release(CONN *conn);
308 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
309 	rpcib_ping_t *, CONN **);
310 static rdma_stat rib_getinfo(rdma_info_t *info);
311 
312 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
313 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
314 static void rib_destroy_cache(rib_hca_t *hca);
315 static	void	rib_server_side_cache_reclaim(void *argp);
316 static int avl_compare(const void *t1, const void *t2);
317 
318 static void rib_stop_services(rib_hca_t *);
319 static void rib_close_channels(rib_conn_list_t *);
320 static void rib_conn_close(void *);
321 static void rib_recv_rele(rib_qp_t *);
322 static rdma_stat rib_conn_release_locked(CONN *conn);
323 
324 /*
325  * RPCIB addressing operations
326  */
327 
328 /*
329  * RDMA operations the RPCIB module exports
330  */
331 static rdmaops_t rib_ops = {
332 	rib_reachable,
333 	rib_conn_get,
334 	rib_conn_release,
335 	rib_listen,
336 	rib_listen_stop,
337 	rib_registermem,
338 	rib_deregistermem,
339 	rib_registermemsync,
340 	rib_deregistermemsync,
341 	rib_syncmem,
342 	rib_reg_buf_alloc,
343 	rib_reg_buf_free,
344 	rib_send,
345 	rib_send_resp,
346 	rib_post_resp,
347 	rib_post_resp_remove,
348 	rib_post_recv,
349 	rib_recv,
350 	rib_read,
351 	rib_write,
352 	rib_getinfo,
353 };
354 
355 /*
356  * RDMATF RPCIB plugin details
357  */
358 static rdma_mod_t rib_mod = {
359 	"ibtf",		/* api name */
360 	RDMATF_VERS_1,
361 	0,
362 	&rib_ops,	/* rdma op vector for ibtf */
363 };
364 
365 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
366 static rdma_stat rib_qp_init(rib_qp_t *, int);
367 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
368 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
369 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
370 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
371 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
372 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
373 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
374 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
375 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
376 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
377 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
378 	rib_qp_t **);
379 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
380 	rib_qp_t **);
381 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
382 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
383 static int rib_free_sendwait(struct send_wid *);
384 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
385 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
386 static void rdma_done_rem_list(rib_qp_t *);
387 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
388 
389 static void rib_async_handler(void *,
390 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
391 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
392 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
393 static int rib_free_svc_recv(struct svc_recv *);
394 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
395 static void rib_free_wid(struct recv_wid *);
396 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
397 static void rib_detach_hca(ibt_hca_hdl_t);
398 static void rib_close_a_channel(CONN *);
399 static void rib_send_hold(rib_qp_t *);
400 static void rib_send_rele(rib_qp_t *);
401 
402 /*
403  * Registration with IBTF as a consumer
404  */
405 static struct ibt_clnt_modinfo_s rib_modinfo = {
406 	IBTI_V_CURR,
407 	IBT_GENERIC,
408 	rib_async_handler,	/* async event handler */
409 	NULL,			/* Memory Region Handler */
410 	"nfs/ib"
411 };
412 
413 /*
414  * Global strucuture
415  */
416 
417 typedef struct rpcib_s {
418 	dev_info_t	*rpcib_dip;
419 	kmutex_t	rpcib_mutex;
420 } rpcib_t;
421 
422 rpcib_t rpcib;
423 
/*
 * /etc/system controlled variable to control
 * debugging in the rpcib kernel module.
 * Set it to values greater than 1 to control
 * the amount of debugging messages required.
 */
int	rib_debug = 0;
431 
432 int
433 _init(void)
434 {
435 	int error;
436 
437 	error = mod_install((struct modlinkage *)&rib_modlinkage);
438 	if (error != 0) {
439 		/*
440 		 * Could not load module
441 		 */
442 		return (error);
443 	}
444 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
445 	return (0);
446 }
447 
448 int
449 _fini()
450 {
451 	int status;
452 
453 	/*
454 	 * Remove module
455 	 */
456 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
457 		return (status);
458 	}
459 	mutex_destroy(&plugin_state_lock);
460 	return (0);
461 }
462 
463 int
464 _info(struct modinfo *modinfop)
465 {
466 	return (mod_info(&rib_modlinkage, modinfop));
467 }
468 
469 /*
470  * rpcib_getinfo()
471  * Given the device number, return the devinfo pointer or the
472  * instance number.
473  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
474  */
475 
476 /*ARGSUSED*/
477 static int
478 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
479 {
480 	int ret = DDI_SUCCESS;
481 
482 	switch (cmd) {
483 	case DDI_INFO_DEVT2DEVINFO:
484 		if (rpcib.rpcib_dip != NULL)
485 			*result = rpcib.rpcib_dip;
486 		else {
487 			*result = NULL;
488 			ret = DDI_FAILURE;
489 		}
490 		break;
491 
492 	case DDI_INFO_DEVT2INSTANCE:
493 		*result = NULL;
494 		break;
495 
496 	default:
497 		ret = DDI_FAILURE;
498 	}
499 	return (ret);
500 }
501 
502 static void
503 rpcib_free_hca_list()
504 {
505 	rib_hca_t *hca, *hcap;
506 
507 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
508 	hca = rib_stat->hcas_list;
509 	rib_stat->hcas_list = NULL;
510 	rw_exit(&rib_stat->hcas_list_lock);
511 	while (hca != NULL) {
512 		rw_enter(&hca->state_lock, RW_WRITER);
513 		hcap = hca;
514 		hca = hca->next;
515 		rib_stat->nhca_inited--;
516 		rib_mod.rdma_count--;
517 		hcap->state = HCA_DETACHED;
518 		rw_exit(&hcap->state_lock);
519 		rib_stop_hca_services(hcap);
520 
521 		kmem_free(hcap, sizeof (*hcap));
522 	}
523 }
524 
525 static rdma_stat
526 rpcib_free_service_list()
527 {
528 	rib_service_t *service;
529 	ibt_status_t ret;
530 
531 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
532 	while (rib_stat->service_list != NULL) {
533 		service = rib_stat->service_list;
534 		ret = ibt_unbind_all_services(service->srv_hdl);
535 		if (ret != IBT_SUCCESS) {
536 			rw_exit(&rib_stat->service_list_lock);
537 #ifdef DEBUG
538 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
539 			    "ibt_unbind_all_services failed (%d)\n", (int)ret);
540 #endif
541 			return (RDMA_FAILED);
542 		}
543 		ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
544 		    service->srv_hdl);
545 		if (ret != IBT_SUCCESS) {
546 			rw_exit(&rib_stat->service_list_lock);
547 #ifdef DEBUG
548 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
549 			    "ibt_deregister_service failed (%d)\n", (int)ret);
550 #endif
551 			return (RDMA_FAILED);
552 		}
553 		rib_stat->service_list = service->next;
554 		kmem_free(service, sizeof (rib_service_t));
555 	}
556 	rw_exit(&rib_stat->service_list_lock);
557 
558 	return (RDMA_SUCCESS);
559 }
560 
561 static int
562 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
563 {
564 	ibt_status_t	ibt_status;
565 	rdma_stat	r_status;
566 
567 	switch (cmd) {
568 	case DDI_ATTACH:
569 		break;
570 	case DDI_RESUME:
571 		return (DDI_SUCCESS);
572 	default:
573 		return (DDI_FAILURE);
574 	}
575 
576 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
577 
578 	mutex_enter(&rpcib.rpcib_mutex);
579 	if (rpcib.rpcib_dip != NULL) {
580 		mutex_exit(&rpcib.rpcib_mutex);
581 		return (DDI_FAILURE);
582 	}
583 	rpcib.rpcib_dip = dip;
584 	mutex_exit(&rpcib.rpcib_mutex);
585 	/*
586 	 * Create the "rpcib" minor-node.
587 	 */
588 	if (ddi_create_minor_node(dip,
589 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
590 		/* Error message, no cmn_err as they print on console */
591 		return (DDI_FAILURE);
592 	}
593 
594 	if (rib_stat == NULL) {
595 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
596 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
597 		rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
598 		mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
599 	}
600 
601 	rib_stat->hca_count = ibt_get_hca_list(NULL);
602 	if (rib_stat->hca_count < 1) {
603 		mutex_destroy(&rib_stat->listen_lock);
604 		rw_destroy(&rib_stat->hcas_list_lock);
605 		mutex_destroy(&rib_stat->open_hca_lock);
606 		kmem_free(rib_stat, sizeof (*rib_stat));
607 		rib_stat = NULL;
608 		return (DDI_FAILURE);
609 	}
610 
611 	ibt_status = ibt_attach(&rib_modinfo, dip,
612 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
613 
614 	if (ibt_status != IBT_SUCCESS) {
615 		mutex_destroy(&rib_stat->listen_lock);
616 		rw_destroy(&rib_stat->hcas_list_lock);
617 		mutex_destroy(&rib_stat->open_hca_lock);
618 		kmem_free(rib_stat, sizeof (*rib_stat));
619 		rib_stat = NULL;
620 		return (DDI_FAILURE);
621 	}
622 
623 	rib_stat->service_list = NULL;
624 	rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
625 	mutex_enter(&rib_stat->open_hca_lock);
626 	if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
627 		mutex_exit(&rib_stat->open_hca_lock);
628 		goto open_fail;
629 	}
630 	mutex_exit(&rib_stat->open_hca_lock);
631 
632 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
633 	    DDI_PROP_SUCCESS) {
634 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
635 		    "failed.");
636 		goto register_fail;
637 	}
638 
639 	/*
640 	 * Register with rdmatf
641 	 */
642 	r_status = rdma_register_mod(&rib_mod);
643 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
644 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
645 		    "status = %d", r_status);
646 		goto register_fail;
647 	}
648 
649 	return (DDI_SUCCESS);
650 
651 register_fail:
652 
653 open_fail:
654 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
655 	rpcib_free_hca_list();
656 	(void) rpcib_free_service_list();
657 	mutex_destroy(&rib_stat->listen_lock);
658 	rw_destroy(&rib_stat->hcas_list_lock);
659 	mutex_destroy(&rib_stat->open_hca_lock);
660 	rw_destroy(&rib_stat->service_list_lock);
661 	kmem_free(rib_stat, sizeof (*rib_stat));
662 	rib_stat = NULL;
663 	return (DDI_FAILURE);
664 }
665 
666 /*ARGSUSED*/
667 static int
668 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
669 {
670 	switch (cmd) {
671 
672 	case DDI_DETACH:
673 		break;
674 
675 	case DDI_SUSPEND:
676 	default:
677 		return (DDI_FAILURE);
678 	}
679 
680 	/*
681 	 * Detach the hca and free resources
682 	 */
683 	mutex_enter(&plugin_state_lock);
684 	plugin_state = NO_ACCEPT;
685 	mutex_exit(&plugin_state_lock);
686 
687 	if (rpcib_free_service_list() != RDMA_SUCCESS)
688 		return (DDI_FAILURE);
689 	rpcib_free_hca_list();
690 
691 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
692 	mutex_destroy(&rib_stat->listen_lock);
693 	rw_destroy(&rib_stat->hcas_list_lock);
694 	mutex_destroy(&rib_stat->open_hca_lock);
695 	rw_destroy(&rib_stat->service_list_lock);
696 
697 	kmem_free(rib_stat, sizeof (*rib_stat));
698 	rib_stat = NULL;
699 
700 	mutex_enter(&rpcib.rpcib_mutex);
701 	rpcib.rpcib_dip = NULL;
702 	mutex_exit(&rpcib.rpcib_mutex);
703 	mutex_destroy(&rpcib.rpcib_mutex);
704 	return (DDI_SUCCESS);
705 }
706 
707 
708 static void rib_rbufpool_free(rib_hca_t *, int);
709 static void rib_rbufpool_deregister(rib_hca_t *, int);
710 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
711 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
712 static rdma_stat rib_rem_replylist(rib_qp_t *);
713 static int rib_remreply(rib_qp_t *, struct reply *);
714 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
715 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
716 
717 
718 /*
719  * One CQ pair per HCA
720  */
721 static rdma_stat
722 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
723 	rib_cq_t **cqp)
724 {
725 	rib_cq_t	*cq;
726 	ibt_cq_attr_t	cq_attr;
727 	uint32_t	real_size;
728 	ibt_status_t	status;
729 	rdma_stat	error = RDMA_SUCCESS;
730 
731 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
732 	cq->rib_hca = hca;
733 	cq_attr.cq_size = cq_size;
734 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
735 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
736 	    &real_size);
737 	if (status != IBT_SUCCESS) {
738 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
739 		    " status=%d", status);
740 		error = RDMA_FAILED;
741 		goto fail;
742 	}
743 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
744 
745 	/*
746 	 * Enable CQ callbacks. CQ Callbacks are single shot
747 	 * (e.g. you have to call ibt_enable_cq_notify()
748 	 * after each callback to get another one).
749 	 */
750 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
751 	if (status != IBT_SUCCESS) {
752 		cmn_err(CE_WARN, "rib_create_cq: "
753 		    "enable_cq_notify failed, status %d", status);
754 		error = RDMA_FAILED;
755 		goto fail;
756 	}
757 	*cqp = cq;
758 
759 	return (error);
760 fail:
761 	if (cq->rib_cq_hdl)
762 		(void) ibt_free_cq(cq->rib_cq_hdl);
763 	if (cq)
764 		kmem_free(cq, sizeof (rib_cq_t));
765 	return (error);
766 }
767 
768 /*
769  * rpcib_find_hca
770  *
771  * Caller should have already locked the hcas_lock before calling
772  * this function.
773  */
774 static rib_hca_t *
775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
776 {
777 	rib_hca_t *hca = ribstat->hcas_list;
778 
779 	while (hca && hca->hca_guid != guid)
780 		hca = hca->next;
781 
782 	return (hca);
783 }
784 
785 static rdma_stat
786 rpcib_open_hcas(rpcib_state_t *ribstat)
787 {
788 	rib_hca_t		*hca;
789 	ibt_status_t		ibt_status;
790 	rdma_stat		status;
791 	ibt_hca_portinfo_t	*pinfop;
792 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
793 	uint_t			size, cq_size;
794 	int			i;
795 	kstat_t *ksp;
796 	cache_avl_struct_t example_avl_node;
797 	char rssc_name[32];
798 	int old_nhca_inited = ribstat->nhca_inited;
799 	ib_guid_t		*hca_guids;
800 
801 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
802 
803 	ribstat->hca_count = ibt_get_hca_list(&hca_guids);
804 	if (ribstat->hca_count == 0)
805 		return (RDMA_FAILED);
806 
807 	rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
808 	/*
809 	 * Open a hca and setup for RDMA
810 	 */
811 	for (i = 0; i < ribstat->hca_count; i++) {
812 		if (rpcib_find_hca(ribstat, hca_guids[i]))
813 			continue;
814 		hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
815 
816 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
817 		    hca_guids[i], &hca->hca_hdl);
818 		if (ibt_status != IBT_SUCCESS) {
819 			kmem_free(hca, sizeof (rib_hca_t));
820 			continue;
821 		}
822 		hca->hca_guid = hca_guids[i];
823 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
824 		hca->state = HCA_INITED;
825 
826 		/*
827 		 * query HCA info
828 		 */
829 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
830 		if (ibt_status != IBT_SUCCESS) {
831 			goto fail1;
832 		}
833 
834 		/*
835 		 * One PD (Protection Domain) per HCA.
836 		 * A qp is allowed to access a memory region
837 		 * only when it's in the same PD as that of
838 		 * the memory region.
839 		 */
840 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
841 		if (ibt_status != IBT_SUCCESS) {
842 			goto fail1;
843 		}
844 
845 		/*
846 		 * query HCA ports
847 		 */
848 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
849 		    0, &pinfop, &hca->hca_nports, &size);
850 		if (ibt_status != IBT_SUCCESS) {
851 			goto fail2;
852 		}
853 		hca->hca_ports = pinfop;
854 		hca->hca_pinfosz = size;
855 		pinfop = NULL;
856 
857 		cq_size = DEF_CQ_SIZE; /* default cq size */
858 		/*
859 		 * Create 2 pairs of cq's (1 pair for client
860 		 * and the other pair for server) on this hca.
861 		 * If number of qp's gets too large, then several
862 		 * cq's will be needed.
863 		 */
864 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
865 		    &hca->svc_rcq);
866 		if (status != RDMA_SUCCESS) {
867 			goto fail3;
868 		}
869 
870 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
871 		    &hca->svc_scq);
872 		if (status != RDMA_SUCCESS) {
873 			goto fail3;
874 		}
875 
876 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
877 		    &hca->clnt_rcq);
878 		if (status != RDMA_SUCCESS) {
879 			goto fail3;
880 		}
881 
882 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
883 		    &hca->clnt_scq);
884 		if (status != RDMA_SUCCESS) {
885 			goto fail3;
886 		}
887 
888 		/*
889 		 * Create buffer pools.
890 		 * Note rib_rbuf_create also allocates memory windows.
891 		 */
892 		hca->recv_pool = rib_rbufpool_create(hca,
893 		    RECV_BUFFER, rib_max_rbufs);
894 		if (hca->recv_pool == NULL) {
895 			goto fail3;
896 		}
897 
898 		hca->send_pool = rib_rbufpool_create(hca,
899 		    SEND_BUFFER, rib_max_rbufs);
900 		if (hca->send_pool == NULL) {
901 			rib_rbufpool_destroy(hca, RECV_BUFFER);
902 			goto fail3;
903 		}
904 
905 		if (hca->server_side_cache == NULL) {
906 			(void) sprintf(rssc_name,
907 			    "rib_srvr_cache_%llx",
908 			    (long long unsigned int) hca->hca_guid);
909 			hca->server_side_cache = kmem_cache_create(
910 			    rssc_name,
911 			    sizeof (cache_avl_struct_t), 0,
912 			    NULL,
913 			    NULL,
914 			    rib_server_side_cache_reclaim,
915 			    hca, NULL, 0);
916 		}
917 
918 		avl_create(&hca->avl_tree,
919 		    avl_compare,
920 		    sizeof (cache_avl_struct_t),
921 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
922 		    (uint_t)(uintptr_t)&example_avl_node);
923 
924 		rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
925 		    hca->iblock);
926 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
927 		rw_init(&hca->avl_rw_lock,
928 		    NULL, RW_DRIVER, hca->iblock);
929 		mutex_init(&hca->cache_allocation_lock,
930 		    NULL, MUTEX_DRIVER, NULL);
931 		hca->avl_init = TRUE;
932 
933 		/* Create kstats for the cache */
934 		ASSERT(INGLOBALZONE(curproc));
935 
936 		if (!stats_enabled) {
937 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
938 			    KSTAT_TYPE_NAMED,
939 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
940 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
941 			    GLOBAL_ZONEID);
942 			if (ksp) {
943 				ksp->ks_data = (void *) &rpcib_kstat;
944 				ksp->ks_update = rpcib_cache_kstat_update;
945 				kstat_install(ksp);
946 				stats_enabled = TRUE;
947 			}
948 		}
949 		if (hca->cleanup_helper == NULL) {
950 			char tq_name[sizeof (hca->hca_guid) * 2 + 1];
951 
952 			(void) snprintf(tq_name, sizeof (tq_name), "%llX",
953 			    (unsigned long long int) hca->hca_guid);
954 			hca->cleanup_helper = ddi_taskq_create(NULL,
955 			    tq_name, 1, TASKQ_DEFAULTPRI, 0);
956 		}
957 
958 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
959 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
960 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
961 		    hca->iblock);
962 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
963 		    hca->iblock);
964 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
965 		hca->inuse = TRUE;
966 
967 		hca->next = ribstat->hcas_list;
968 		ribstat->hcas_list = hca;
969 		ribstat->nhca_inited++;
970 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
971 		continue;
972 
973 fail3:
974 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
975 fail2:
976 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
977 fail1:
978 		(void) ibt_close_hca(hca->hca_hdl);
979 		kmem_free(hca, sizeof (rib_hca_t));
980 	}
981 	rw_exit(&ribstat->hcas_list_lock);
982 	ibt_free_hca_list(hca_guids, ribstat->hca_count);
983 	rib_mod.rdma_count = rib_stat->nhca_inited;
984 
985 	/*
986 	 * return success if at least one new hca has been configured.
987 	 */
988 	if (ribstat->nhca_inited != old_nhca_inited)
989 		return (RDMA_SUCCESS);
990 	else
991 		return (RDMA_FAILED);
992 }
993 
994 /*
995  * Callback routines
996  */
997 
998 /*
999  * SCQ handlers
1000  */
1001 /* ARGSUSED */
1002 static void
1003 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1004 {
1005 	ibt_status_t	ibt_status;
1006 	ibt_wc_t	wc;
1007 	struct send_wid	*wd;
1008 	CONN		*conn;
1009 	rib_qp_t	*qp;
1010 	int		i;
1011 
1012 	/*
1013 	 * Re-enable cq notify here to avoid missing any
1014 	 * completion queue notification.
1015 	 */
1016 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1017 
1018 	ibt_status = IBT_SUCCESS;
1019 	while (ibt_status != IBT_CQ_EMPTY) {
1020 		bzero(&wc, sizeof (wc));
1021 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1022 		if (ibt_status != IBT_SUCCESS)
1023 			return;
1024 
1025 		/*
1026 		 * Got a send completion
1027 		 */
1028 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1029 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1030 			qp = wd->qp;
1031 			conn = qptoc(qp);
1032 
1033 			mutex_enter(&wd->sendwait_lock);
1034 			switch (wc.wc_status) {
1035 			case IBT_WC_SUCCESS:
1036 				wd->status = RDMA_SUCCESS;
1037 				break;
1038 			default:
1039 /*
1040  *    RC Send Q Error Code		Local state     Remote State
1041  *    ==================== 		===========     ============
1042  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
1043  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
1044  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
1045  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
1046  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
1047  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
1048  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
1049  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
1050  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
1051  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
1052  *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
1053  */
1054 				/*
1055 				 * Channel in error state. Set connection to
1056 				 * ERROR and cleanup will happen either from
1057 				 * conn_release  or from rib_conn_get
1058 				 */
1059 				wd->status = RDMA_FAILED;
1060 				mutex_enter(&conn->c_lock);
1061 				if (conn->c_state != C_DISCONN_PEND)
1062 					conn->c_state = C_ERROR_CONN;
1063 				mutex_exit(&conn->c_lock);
1064 				break;
1065 			}
1066 
1067 			if (wd->cv_sig == 1) {
1068 				/*
1069 				 * Notify poster
1070 				 */
1071 				cv_signal(&wd->wait_cv);
1072 				mutex_exit(&wd->sendwait_lock);
1073 			} else {
1074 				/*
1075 				 * Poster not waiting for notification.
1076 				 * Free the send buffers and send_wid
1077 				 */
1078 				for (i = 0; i < wd->nsbufs; i++) {
1079 					rib_rbuf_free(qptoc(wd->qp),
1080 					    SEND_BUFFER,
1081 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1082 				}
1083 
1084 				/* decrement the send ref count */
1085 				rib_send_rele(qp);
1086 
1087 				mutex_exit(&wd->sendwait_lock);
1088 				(void) rib_free_sendwait(wd);
1089 			}
1090 		}
1091 	}
1092 }
1093 
1094 /* ARGSUSED */
1095 static void
1096 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1097 {
1098 	ibt_status_t	ibt_status;
1099 	ibt_wc_t	wc;
1100 	struct send_wid	*wd;
1101 	rib_qp_t	*qp;
1102 	CONN		*conn;
1103 	int		i;
1104 
1105 	/*
1106 	 * Re-enable cq notify here to avoid missing any
1107 	 * completion queue notification.
1108 	 */
1109 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1110 
1111 	ibt_status = IBT_SUCCESS;
1112 	while (ibt_status != IBT_CQ_EMPTY) {
1113 		bzero(&wc, sizeof (wc));
1114 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1115 		if (ibt_status != IBT_SUCCESS)
1116 			return;
1117 
1118 		/*
1119 		 * Got a send completion
1120 		 */
1121 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1122 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1123 			qp = wd->qp;
1124 			conn = qptoc(qp);
1125 			mutex_enter(&wd->sendwait_lock);
1126 
1127 			switch (wc.wc_status) {
1128 			case IBT_WC_SUCCESS:
1129 				wd->status = RDMA_SUCCESS;
1130 				break;
1131 			default:
1132 				/*
1133 				 * Channel in error state. Set connection to
1134 				 * ERROR and cleanup will happen either from
1135 				 * conn_release  or conn timeout.
1136 				 */
1137 				wd->status = RDMA_FAILED;
1138 				mutex_enter(&conn->c_lock);
1139 				if (conn->c_state != C_DISCONN_PEND)
1140 					conn->c_state = C_ERROR_CONN;
1141 				mutex_exit(&conn->c_lock);
1142 				break;
1143 			}
1144 
1145 			if (wd->cv_sig == 1) {
1146 				/*
1147 				 * Update completion status and notify poster
1148 				 */
1149 				cv_signal(&wd->wait_cv);
1150 				mutex_exit(&wd->sendwait_lock);
1151 			} else {
1152 				/*
1153 				 * Poster not waiting for notification.
1154 				 * Free the send buffers and send_wid
1155 				 */
1156 				for (i = 0; i < wd->nsbufs; i++) {
1157 					rib_rbuf_free(qptoc(wd->qp),
1158 					    SEND_BUFFER,
1159 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1160 				}
1161 
1162 				/* decrement the send ref count */
1163 				rib_send_rele(qp);
1164 
1165 				mutex_exit(&wd->sendwait_lock);
1166 				(void) rib_free_sendwait(wd);
1167 			}
1168 		}
1169 	}
1170 }
1171 
1172 /*
1173  * RCQ handler
1174  */
1175 /* ARGSUSED */
1176 static void
1177 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1178 {
1179 	rib_qp_t	*qp;
1180 	ibt_status_t	ibt_status;
1181 	ibt_wc_t	wc;
1182 	struct recv_wid	*rwid;
1183 
1184 	/*
1185 	 * Re-enable cq notify here to avoid missing any
1186 	 * completion queue notification.
1187 	 */
1188 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1189 
1190 	ibt_status = IBT_SUCCESS;
1191 	while (ibt_status != IBT_CQ_EMPTY) {
1192 		bzero(&wc, sizeof (wc));
1193 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1194 		if (ibt_status != IBT_SUCCESS)
1195 			return;
1196 
1197 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1198 		qp = rwid->qp;
1199 
1200 		if (wc.wc_status == IBT_WC_SUCCESS) {
1201 			XDR	inxdrs, *xdrs;
1202 			uint_t	xid, vers, op, find_xid = 0;
1203 			struct reply	*r;
1204 			CONN *conn = qptoc(qp);
1205 			uint32_t rdma_credit = 0;
1206 
1207 			xdrs = &inxdrs;
1208 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1209 			    wc.wc_bytes_xfer, XDR_DECODE);
1210 			/*
1211 			 * Treat xid as opaque (xid is the first entity
1212 			 * in the rpc rdma message).
1213 			 */
1214 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1215 
1216 			/* Skip xid and set the xdr position accordingly. */
1217 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1218 			(void) xdr_u_int(xdrs, &vers);
1219 			(void) xdr_u_int(xdrs, &rdma_credit);
1220 			(void) xdr_u_int(xdrs, &op);
1221 			XDR_DESTROY(xdrs);
1222 
1223 			if (vers != RPCRDMA_VERS) {
1224 				/*
1225 				 * Invalid RPC/RDMA version. Cannot
1226 				 * interoperate.  Set connection to
1227 				 * ERROR state and bail out.
1228 				 */
1229 				mutex_enter(&conn->c_lock);
1230 				if (conn->c_state != C_DISCONN_PEND)
1231 					conn->c_state = C_ERROR_CONN;
1232 				mutex_exit(&conn->c_lock);
1233 				rib_rbuf_free(conn, RECV_BUFFER,
1234 				    (void *)(uintptr_t)rwid->addr);
1235 				rib_free_wid(rwid);
1236 				rib_recv_rele(qp);
1237 				continue;
1238 			}
1239 
1240 			mutex_enter(&qp->replylist_lock);
1241 			for (r = qp->replylist; r != NULL; r = r->next) {
1242 				if (r->xid == xid) {
1243 					find_xid = 1;
1244 					switch (op) {
1245 					case RDMA_MSG:
1246 					case RDMA_NOMSG:
1247 					case RDMA_MSGP:
1248 						r->status = RDMA_SUCCESS;
1249 						r->vaddr_cq = rwid->addr;
1250 						r->bytes_xfer =
1251 						    wc.wc_bytes_xfer;
1252 						cv_signal(&r->wait_cv);
1253 						break;
1254 					default:
1255 						rib_rbuf_free(qptoc(qp),
1256 						    RECV_BUFFER,
1257 						    (void *)(uintptr_t)
1258 						    rwid->addr);
1259 						break;
1260 					}
1261 					break;
1262 				}
1263 			}
1264 			mutex_exit(&qp->replylist_lock);
1265 			if (find_xid == 0) {
1266 				/* RPC caller not waiting for reply */
1267 
1268 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1269 				    int, xid);
1270 
1271 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1272 				    (void *)(uintptr_t)rwid->addr);
1273 			}
1274 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1275 			CONN *conn = qptoc(qp);
1276 
1277 			/*
1278 			 * Connection being flushed. Just free
1279 			 * the posted buffer
1280 			 */
1281 			rib_rbuf_free(conn, RECV_BUFFER,
1282 			    (void *)(uintptr_t)rwid->addr);
1283 		} else {
1284 			CONN *conn = qptoc(qp);
1285 /*
1286  *  RC Recv Q Error Code		Local state     Remote State
1287  *  ====================		===========     ============
1288  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1289  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1290  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1291  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1292  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1293  *  IBT_WC_WR_FLUSHED_ERR               None            None
1294  */
1295 			/*
1296 			 * Channel in error state. Set connection
1297 			 * in ERROR state.
1298 			 */
1299 			mutex_enter(&conn->c_lock);
1300 			if (conn->c_state != C_DISCONN_PEND)
1301 				conn->c_state = C_ERROR_CONN;
1302 			mutex_exit(&conn->c_lock);
1303 			rib_rbuf_free(conn, RECV_BUFFER,
1304 			    (void *)(uintptr_t)rwid->addr);
1305 		}
1306 		rib_free_wid(rwid);
1307 		rib_recv_rele(qp);
1308 	}
1309 }
1310 
1311 /* Server side */
1312 /* ARGSUSED */
1313 static void
1314 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1315 {
1316 	rdma_recv_data_t *rdp;
1317 	rib_qp_t	*qp;
1318 	ibt_status_t	ibt_status;
1319 	ibt_wc_t	wc;
1320 	struct svc_recv	*s_recvp;
1321 	CONN		*conn;
1322 	mblk_t		*mp;
1323 
1324 	/*
1325 	 * Re-enable cq notify here to avoid missing any
1326 	 * completion queue notification.
1327 	 */
1328 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1329 
1330 	ibt_status = IBT_SUCCESS;
1331 	while (ibt_status != IBT_CQ_EMPTY) {
1332 		bzero(&wc, sizeof (wc));
1333 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1334 		if (ibt_status != IBT_SUCCESS)
1335 			return;
1336 
1337 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1338 		qp = s_recvp->qp;
1339 		conn = qptoc(qp);
1340 
1341 		if (wc.wc_status == IBT_WC_SUCCESS) {
1342 			XDR	inxdrs, *xdrs;
1343 			uint_t	xid, vers, op;
1344 			uint32_t rdma_credit;
1345 
1346 			xdrs = &inxdrs;
1347 			/* s_recvp->vaddr stores data */
1348 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1349 			    wc.wc_bytes_xfer, XDR_DECODE);
1350 
1351 			/*
1352 			 * Treat xid as opaque (xid is the first entity
1353 			 * in the rpc rdma message).
1354 			 */
1355 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1356 			/* Skip xid and set the xdr position accordingly. */
1357 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1358 			if (!xdr_u_int(xdrs, &vers) ||
1359 			    !xdr_u_int(xdrs, &rdma_credit) ||
1360 			    !xdr_u_int(xdrs, &op)) {
1361 				rib_rbuf_free(conn, RECV_BUFFER,
1362 				    (void *)(uintptr_t)s_recvp->vaddr);
1363 				XDR_DESTROY(xdrs);
1364 				rib_recv_rele(qp);
1365 				(void) rib_free_svc_recv(s_recvp);
1366 				continue;
1367 			}
1368 			XDR_DESTROY(xdrs);
1369 
1370 			if (vers != RPCRDMA_VERS) {
1371 				/*
1372 				 * Invalid RPC/RDMA version.
1373 				 * Drop rpc rdma message.
1374 				 */
1375 				rib_rbuf_free(conn, RECV_BUFFER,
1376 				    (void *)(uintptr_t)s_recvp->vaddr);
1377 				rib_recv_rele(qp);
1378 				(void) rib_free_svc_recv(s_recvp);
1379 				continue;
1380 			}
1381 			/*
1382 			 * Is this for RDMA_DONE?
1383 			 */
1384 			if (op == RDMA_DONE) {
1385 				rib_rbuf_free(conn, RECV_BUFFER,
1386 				    (void *)(uintptr_t)s_recvp->vaddr);
1387 				/*
1388 				 * Wake up the thread waiting on
1389 				 * a RDMA_DONE for xid
1390 				 */
1391 				mutex_enter(&qp->rdlist_lock);
1392 				rdma_done_notify(qp, xid);
1393 				mutex_exit(&qp->rdlist_lock);
1394 				rib_recv_rele(qp);
1395 				(void) rib_free_svc_recv(s_recvp);
1396 				continue;
1397 			}
1398 
1399 			mutex_enter(&plugin_state_lock);
1400 			mutex_enter(&conn->c_lock);
1401 			if ((plugin_state == ACCEPT) &&
1402 			    (conn->c_state == C_CONNECTED)) {
1403 				conn->c_ref++;
1404 				mutex_exit(&conn->c_lock);
1405 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1406 				    == NULL)
1407 					(void) strwaitbuf(
1408 					    sizeof (*rdp), BPRI_LO);
1409 				/*
1410 				 * Plugin is in accept state, hence the master
1411 				 * transport queue for this is still accepting
1412 				 * requests. Hence we can call svc_queuereq to
1413 				 * queue this recieved msg.
1414 				 */
1415 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1416 				rdp->conn = conn;
1417 				rdp->rpcmsg.addr =
1418 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1419 				rdp->rpcmsg.type = RECV_BUFFER;
1420 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1421 				rdp->status = wc.wc_status;
1422 				mp->b_wptr += sizeof (*rdp);
1423 				svc_queuereq((queue_t *)rib_stat->q, mp);
1424 				mutex_exit(&plugin_state_lock);
1425 			} else {
1426 				/*
1427 				 * The master transport for this is going
1428 				 * away and the queue is not accepting anymore
1429 				 * requests for krpc, so don't do anything, just
1430 				 * free the msg.
1431 				 */
1432 				mutex_exit(&conn->c_lock);
1433 				mutex_exit(&plugin_state_lock);
1434 				rib_rbuf_free(conn, RECV_BUFFER,
1435 				    (void *)(uintptr_t)s_recvp->vaddr);
1436 			}
1437 		} else {
1438 			rib_rbuf_free(conn, RECV_BUFFER,
1439 			    (void *)(uintptr_t)s_recvp->vaddr);
1440 		}
1441 		rib_recv_rele(qp);
1442 		(void) rib_free_svc_recv(s_recvp);
1443 	}
1444 }
1445 
1446 static void
1447 rib_attach_hca()
1448 {
1449 	mutex_enter(&rib_stat->open_hca_lock);
1450 	(void) rpcib_open_hcas(rib_stat);
1451 	rib_listen(NULL);
1452 	mutex_exit(&rib_stat->open_hca_lock);
1453 }
1454 
1455 /*
1456  * Handles DR event of IBT_HCA_DETACH_EVENT.
1457  */
1458 /* ARGSUSED */
1459 static void
1460 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1461 	ibt_async_code_t code, ibt_async_event_t *event)
1462 {
1463 	switch (code) {
1464 	case IBT_HCA_ATTACH_EVENT:
1465 		rib_attach_hca();
1466 		break;
1467 	case IBT_HCA_DETACH_EVENT:
1468 		rib_detach_hca(hca_hdl);
1469 #ifdef DEBUG
1470 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1471 #endif
1472 		break;
1473 	case IBT_EVENT_PORT_UP:
1474 		/*
1475 		 * A port is up. We should call rib_listen() since there is
1476 		 * a chance that rib_listen() may have failed during
1477 		 * rib_attach_hca() because the port had not been up yet.
1478 		 */
1479 		rib_listen(NULL);
1480 #ifdef DEBUG
1481 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1482 #endif
1483 		break;
1484 #ifdef DEBUG
1485 	case IBT_EVENT_PATH_MIGRATED:
1486 		cmn_err(CE_NOTE, "rib_async_handler(): "
1487 		    "IBT_EVENT_PATH_MIGRATED\n");
1488 		break;
1489 	case IBT_EVENT_SQD:
1490 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1491 		break;
1492 	case IBT_EVENT_COM_EST:
1493 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1494 		break;
1495 	case IBT_ERROR_CATASTROPHIC_CHAN:
1496 		cmn_err(CE_NOTE, "rib_async_handler(): "
1497 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1498 		break;
1499 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1500 		cmn_err(CE_NOTE, "rib_async_handler(): "
1501 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1502 		break;
1503 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1504 		cmn_err(CE_NOTE, "rib_async_handler(): "
1505 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1506 		break;
1507 	case IBT_ERROR_PATH_MIGRATE_REQ:
1508 		cmn_err(CE_NOTE, "rib_async_handler(): "
1509 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1510 		break;
1511 	case IBT_ERROR_CQ:
1512 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1513 		break;
1514 	case IBT_ERROR_PORT_DOWN:
1515 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1516 		break;
1517 	case IBT_ASYNC_OPAQUE1:
1518 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1519 		break;
1520 	case IBT_ASYNC_OPAQUE2:
1521 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1522 		break;
1523 	case IBT_ASYNC_OPAQUE3:
1524 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1525 		break;
1526 	case IBT_ASYNC_OPAQUE4:
1527 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1528 		break;
1529 #endif
1530 	default:
1531 		break;
1532 	}
1533 }
1534 
1535 /*
1536  * Client's reachable function.
1537  */
1538 static rdma_stat
1539 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1540 {
1541 	rdma_stat	status;
1542 	rpcib_ping_t	rpt;
1543 	struct netbuf	saddr;
1544 	CONN		*conn;
1545 
1546 	bzero(&saddr, sizeof (struct netbuf));
1547 	status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1548 
1549 	if (status == RDMA_SUCCESS) {
1550 		*handle = (void *)rpt.hca;
1551 		/* release the reference */
1552 		(void) rib_conn_release(conn);
1553 		return (RDMA_SUCCESS);
1554 	} else {
1555 		*handle = NULL;
1556 		DTRACE_PROBE(rpcib__i__pingfailed);
1557 		return (RDMA_FAILED);
1558 	}
1559 }
1560 
1561 /* Client side qp creation */
1562 static rdma_stat
1563 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1564 {
1565 	rib_qp_t	*kqp = NULL;
1566 	CONN		*conn;
1567 	rdma_clnt_cred_ctrl_t *cc_info;
1568 
1569 	ASSERT(qp != NULL);
1570 	*qp = NULL;
1571 
1572 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1573 	conn = qptoc(kqp);
1574 	kqp->hca = hca;
1575 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1576 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1577 
1578 	kqp->mode = RIB_CLIENT;
1579 	kqp->chan_flags = IBT_BLOCKING;
1580 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1581 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1582 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1583 	/*
1584 	 * Initialize
1585 	 */
1586 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1587 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1588 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1589 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1590 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1591 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1592 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1593 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1594 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1595 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1596 	/*
1597 	 * Initialize the client credit control
1598 	 * portion of the rdmaconn struct.
1599 	 */
1600 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1601 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1602 	cc_info->clnt_cc_granted_ops = 0;
1603 	cc_info->clnt_cc_in_flight_ops = 0;
1604 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1605 
1606 	*qp = kqp;
1607 	return (RDMA_SUCCESS);
1608 }
1609 
1610 /* Server side qp creation */
1611 static rdma_stat
1612 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1613 {
1614 	rib_qp_t	*kqp = NULL;
1615 	ibt_chan_sizes_t	chan_sizes;
1616 	ibt_rc_chan_alloc_args_t	qp_attr;
1617 	ibt_status_t		ibt_status;
1618 	rdma_srv_cred_ctrl_t *cc_info;
1619 
1620 	*qp = NULL;
1621 
1622 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1623 	kqp->hca = hca;
1624 	kqp->port_num = port;
1625 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1626 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1627 
1628 	/*
1629 	 * Create the qp handle
1630 	 */
1631 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1632 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1633 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1634 	qp_attr.rc_pd = hca->pd_hdl;
1635 	qp_attr.rc_hca_port_num = port;
1636 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1637 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1638 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1639 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1640 	qp_attr.rc_clone_chan = NULL;
1641 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1642 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1643 
1644 	rw_enter(&hca->state_lock, RW_READER);
1645 	if (hca->state != HCA_DETACHED) {
1646 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1647 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1648 		    &chan_sizes);
1649 	} else {
1650 		rw_exit(&hca->state_lock);
1651 		goto fail;
1652 	}
1653 	rw_exit(&hca->state_lock);
1654 
1655 	if (ibt_status != IBT_SUCCESS) {
1656 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1657 		    int, ibt_status);
1658 		goto fail;
1659 	}
1660 
1661 	kqp->mode = RIB_SERVER;
1662 	kqp->chan_flags = IBT_BLOCKING;
1663 	kqp->q = q;	/* server ONLY */
1664 
1665 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1666 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1667 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1668 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1669 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1670 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1671 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1672 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1673 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1674 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1675 	/*
1676 	 * Set the private data area to qp to be used in callbacks
1677 	 */
1678 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1679 	kqp->rdmaconn.c_state = C_CONNECTED;
1680 
1681 	/*
1682 	 * Initialize the server credit control
1683 	 * portion of the rdmaconn struct.
1684 	 */
1685 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1686 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1687 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1688 	cc_info->srv_cc_cur_buffers_used = 0;
1689 	cc_info->srv_cc_posted = preposted_rbufs;
1690 
1691 	*qp = kqp;
1692 
1693 	return (RDMA_SUCCESS);
1694 fail:
1695 	if (kqp)
1696 		kmem_free(kqp, sizeof (rib_qp_t));
1697 
1698 	return (RDMA_FAILED);
1699 }
1700 
1701 /* ARGSUSED */
1702 ibt_cm_status_t
1703 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1704     ibt_cm_return_args_t *ret_args, void *priv_data,
1705     ibt_priv_data_len_t len)
1706 {
1707 	rib_hca_t	*hca;
1708 
1709 	hca = (rib_hca_t *)clnt_hdl;
1710 
1711 	switch (event->cm_type) {
1712 
1713 	/* got a connection close event */
1714 	case IBT_CM_EVENT_CONN_CLOSED:
1715 	{
1716 		CONN	*conn;
1717 		rib_qp_t *qp;
1718 
1719 		/* check reason why connection was closed */
1720 		switch (event->cm_event.closed) {
1721 		case IBT_CM_CLOSED_DREP_RCVD:
1722 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1723 		case IBT_CM_CLOSED_DUP:
1724 		case IBT_CM_CLOSED_ABORT:
1725 		case IBT_CM_CLOSED_ALREADY:
1726 			/*
1727 			 * These cases indicate the local end initiated
1728 			 * the closing of the channel. Nothing to do here.
1729 			 */
1730 			break;
1731 		default:
1732 			/*
1733 			 * Reason for CONN_CLOSED event must be one of
1734 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1735 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1736 			 * the remote end is closing the channel. In these
1737 			 * cases free the channel and transition to error
1738 			 * state
1739 			 */
1740 			qp = ibt_get_chan_private(event->cm_channel);
1741 			conn = qptoc(qp);
1742 			mutex_enter(&conn->c_lock);
1743 			if (conn->c_state == C_DISCONN_PEND) {
1744 				mutex_exit(&conn->c_lock);
1745 				break;
1746 			}
1747 
1748 			conn->c_state = C_ERROR_CONN;
1749 
1750 			/*
1751 			 * Free the conn if c_ref is down to 0 already
1752 			 */
1753 			if (conn->c_ref == 0) {
1754 				/*
1755 				 * Remove from list and free conn
1756 				 */
1757 				conn->c_state = C_DISCONN_PEND;
1758 				mutex_exit(&conn->c_lock);
1759 				rw_enter(&hca->state_lock, RW_READER);
1760 				if (hca->state != HCA_DETACHED)
1761 					(void) rib_disconnect_channel(conn,
1762 					    &hca->cl_conn_list);
1763 				rw_exit(&hca->state_lock);
1764 			} else {
1765 				/*
1766 				 * conn will be freed when c_ref goes to 0.
1767 				 * Indicate to cleaning thread not to close
1768 				 * the connection, but just free the channel.
1769 				 */
1770 				conn->c_flags |= C_CLOSE_NOTNEEDED;
1771 				mutex_exit(&conn->c_lock);
1772 			}
1773 #ifdef DEBUG
1774 			if (rib_debug)
1775 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1776 				    "(CONN_CLOSED) channel disconnected");
1777 #endif
1778 			break;
1779 		}
1780 		break;
1781 	}
1782 	default:
1783 		break;
1784 	}
1785 	return (IBT_CM_ACCEPT);
1786 }
1787 
1788 /*
1789  * Connect to the server.
1790  */
1791 rdma_stat
1792 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1793 {
1794 	ibt_chan_open_args_t	chan_args;	/* channel args */
1795 	ibt_chan_sizes_t	chan_sizes;
1796 	ibt_rc_chan_alloc_args_t	qp_attr;
1797 	ibt_status_t		ibt_status;
1798 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1799 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1800 	ibt_ip_cm_info_t	ipcm_info;
1801 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1802 
1803 
1804 	(void) bzero(&chan_args, sizeof (chan_args));
1805 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1806 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1807 
1808 	ipcm_info.src_addr.family = rptp->srcip.family;
1809 	switch (ipcm_info.src_addr.family) {
1810 	case AF_INET:
1811 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1812 		break;
1813 	case AF_INET6:
1814 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1815 		break;
1816 	}
1817 
1818 	ipcm_info.dst_addr.family = rptp->srcip.family;
1819 	switch (ipcm_info.dst_addr.family) {
1820 	case AF_INET:
1821 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1822 		break;
1823 	case AF_INET6:
1824 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1825 		break;
1826 	}
1827 
1828 	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1829 
1830 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1831 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1832 
1833 	if (ibt_status != IBT_SUCCESS) {
1834 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1835 		return (-1);
1836 	}
1837 
1838 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1839 	/* Alloc a RC channel */
1840 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1841 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1842 	qp_attr.rc_pd = hca->pd_hdl;
1843 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1844 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1845 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1846 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1847 	qp_attr.rc_clone_chan = NULL;
1848 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1849 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1850 
1851 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1852 	chan_args.oc_path = &rptp->path;
1853 
1854 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1855 	chan_args.oc_cm_clnt_private = (void *)hca;
1856 	chan_args.oc_rdma_ra_out = 4;
1857 	chan_args.oc_rdma_ra_in = 4;
1858 	chan_args.oc_path_retry_cnt = 2;
1859 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1860 	chan_args.oc_priv_data = cmp_ip_pvt;
1861 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1862 
1863 refresh:
1864 	rw_enter(&hca->state_lock, RW_READER);
1865 	if (hca->state != HCA_DETACHED) {
1866 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1867 		    IBT_ACHAN_NO_FLAGS,
1868 		    &qp_attr, &qp->qp_hdl,
1869 		    &chan_sizes);
1870 	} else {
1871 		rw_exit(&hca->state_lock);
1872 		return (RDMA_FAILED);
1873 	}
1874 	rw_exit(&hca->state_lock);
1875 
1876 	if (ibt_status != IBT_SUCCESS) {
1877 		DTRACE_PROBE1(rpcib__i_conntosrv,
1878 		    int, ibt_status);
1879 		return (RDMA_FAILED);
1880 	}
1881 
1882 	/* Connect to the Server */
1883 	(void) bzero(&ret_args, sizeof (ret_args));
1884 	mutex_enter(&qp->cb_lock);
1885 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1886 	    IBT_BLOCKING, &chan_args, &ret_args);
1887 	if (ibt_status != IBT_SUCCESS) {
1888 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1889 		    int, ibt_status, int, ret_args.rc_status);
1890 
1891 		(void) ibt_free_channel(qp->qp_hdl);
1892 		qp->qp_hdl = NULL;
1893 		mutex_exit(&qp->cb_lock);
1894 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1895 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1896 			/*
1897 			 * Got IBT_CM_CONN_STALE probably because of stale
1898 			 * data on the passive end of a channel that existed
1899 			 * prior to reboot. Retry establishing a channel
1900 			 * REFRESH_ATTEMPTS times, during which time the
1901 			 * stale conditions on the server might clear up.
1902 			 */
1903 			goto refresh;
1904 		}
1905 		return (RDMA_FAILED);
1906 	}
1907 	mutex_exit(&qp->cb_lock);
1908 	/*
1909 	 * Set the private data area to qp to be used in callbacks
1910 	 */
1911 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1912 	return (RDMA_SUCCESS);
1913 }
1914 
1915 rdma_stat
1916 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1917 {
1918 	uint_t			i, addr_count;
1919 	ibt_status_t		ibt_status;
1920 	uint8_t			num_paths_p;
1921 	ibt_ip_path_attr_t	ipattr;
1922 	ibt_path_ip_src_t	srcip;
1923 	rpcib_ipaddrs_t		addrs4;
1924 	rpcib_ipaddrs_t		addrs6;
1925 	struct sockaddr_in	*sinp;
1926 	struct sockaddr_in6	*sin6p;
1927 	rdma_stat		retval = RDMA_FAILED;
1928 	rib_hca_t *hca;
1929 
1930 	if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1931 		return (RDMA_INVAL);
1932 	ASSERT(raddr->buf != NULL);
1933 
1934 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1935 
1936 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1937 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1938 		retval = RDMA_FAILED;
1939 		goto done2;
1940 	}
1941 
1942 	if (addr_type == AF_INET) {
1943 		addr_count = addrs4.ri_count;
1944 		sinp = (struct sockaddr_in *)raddr->buf;
1945 		rptp->dstip.family = AF_INET;
1946 		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1947 		sinp = addrs4.ri_list;
1948 	} else {
1949 		addr_count = addrs6.ri_count;
1950 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1951 		rptp->dstip.family = AF_INET6;
1952 		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1953 		sin6p = addrs6.ri_list;
1954 	}
1955 
1956 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1957 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1958 		rw_enter(&hca->state_lock, RW_READER);
1959 		if (hca->state == HCA_DETACHED) {
1960 			rw_exit(&hca->state_lock);
1961 			continue;
1962 		}
1963 
1964 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1965 		ipattr.ipa_hca_guid	= hca->hca_guid;
1966 		ipattr.ipa_ndst		= 1;
1967 		ipattr.ipa_max_paths	= 1;
1968 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1969 		for (i = 0; i < addr_count; i++) {
1970 			num_paths_p = 0;
1971 			if (addr_type == AF_INET) {
1972 				ipattr.ipa_src_ip.un.ip4addr =
1973 				    sinp[i].sin_addr.s_addr;
1974 			} else {
1975 				ipattr.ipa_src_ip.un.ip6addr =
1976 				    sin6p[i].sin6_addr;
1977 			}
1978 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1979 
1980 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1981 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1982 			    &num_paths_p, &srcip);
1983 			if (ibt_status == IBT_SUCCESS &&
1984 			    num_paths_p != 0 &&
1985 			    rptp->path.pi_hca_guid == hca->hca_guid) {
1986 				rptp->hca = hca;
1987 				rw_exit(&hca->state_lock);
1988 				if (addr_type == AF_INET) {
1989 					rptp->srcip.family = AF_INET;
1990 					rptp->srcip.un.ip4addr =
1991 					    srcip.ip_primary.un.ip4addr;
1992 				} else {
1993 					rptp->srcip.family = AF_INET6;
1994 					rptp->srcip.un.ip6addr =
1995 					    srcip.ip_primary.un.ip6addr;
1996 
1997 				}
1998 				retval = RDMA_SUCCESS;
1999 				goto done1;
2000 			}
2001 		}
2002 		rw_exit(&hca->state_lock);
2003 	}
2004 done1:
2005 	rw_exit(&rib_stat->hcas_list_lock);
2006 done2:
2007 	if (addrs4.ri_size > 0)
2008 		kmem_free(addrs4.ri_list, addrs4.ri_size);
2009 	if (addrs6.ri_size > 0)
2010 		kmem_free(addrs6.ri_list, addrs6.ri_size);
2011 	return (retval);
2012 }
2013 
2014 /*
2015  * Close channel, remove from connection list and
2016  * free up resources allocated for that channel.
2017  */
2018 rdma_stat
2019 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2020 {
2021 	rib_qp_t	*qp = ctoqp(conn);
2022 	rib_hca_t	*hca;
2023 
2024 	mutex_enter(&conn->c_lock);
2025 	if (conn->c_timeout != NULL) {
2026 		mutex_exit(&conn->c_lock);
2027 		(void) untimeout(conn->c_timeout);
2028 		mutex_enter(&conn->c_lock);
2029 	}
2030 
2031 	while (conn->c_flags & C_CLOSE_PENDING) {
2032 		cv_wait(&conn->c_cv, &conn->c_lock);
2033 	}
2034 	mutex_exit(&conn->c_lock);
2035 
2036 	/*
2037 	 * c_ref == 0 and connection is in C_DISCONN_PEND
2038 	 */
2039 	hca = qp->hca;
2040 	if (conn_list != NULL)
2041 		(void) rib_rm_conn(conn, conn_list);
2042 
2043 	/*
2044 	 * There is only one case where we get here with
2045 	 * qp_hdl = NULL, which is during connection setup on
2046 	 * the client. In such a case there are no posted
2047 	 * send/recv buffers.
2048 	 */
2049 	if (qp->qp_hdl != NULL) {
2050 		mutex_enter(&qp->posted_rbufs_lock);
2051 		while (qp->n_posted_rbufs)
2052 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2053 		mutex_exit(&qp->posted_rbufs_lock);
2054 
2055 		mutex_enter(&qp->send_rbufs_lock);
2056 		while (qp->n_send_rbufs)
2057 			cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2058 			mutex_exit(&qp->send_rbufs_lock);
2059 
2060 		(void) ibt_free_channel(qp->qp_hdl);
2061 			qp->qp_hdl = NULL;
2062 	}
2063 
2064 	ASSERT(qp->rdlist == NULL);
2065 
2066 	if (qp->replylist != NULL) {
2067 		(void) rib_rem_replylist(qp);
2068 	}
2069 
2070 	cv_destroy(&qp->cb_conn_cv);
2071 	cv_destroy(&qp->posted_rbufs_cv);
2072 	cv_destroy(&qp->send_rbufs_cv);
2073 	mutex_destroy(&qp->cb_lock);
2074 	mutex_destroy(&qp->replylist_lock);
2075 	mutex_destroy(&qp->posted_rbufs_lock);
2076 	mutex_destroy(&qp->send_rbufs_lock);
2077 	mutex_destroy(&qp->rdlist_lock);
2078 
2079 	cv_destroy(&conn->c_cv);
2080 	mutex_destroy(&conn->c_lock);
2081 
2082 	if (conn->c_raddr.buf != NULL) {
2083 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2084 	}
2085 	if (conn->c_laddr.buf != NULL) {
2086 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2087 	}
2088 	if (conn->c_netid != NULL) {
2089 		kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2090 	}
2091 
2092 	/*
2093 	 * Credit control cleanup.
2094 	 */
2095 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2096 		rdma_clnt_cred_ctrl_t *cc_info;
2097 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2098 		cv_destroy(&cc_info->clnt_cc_cv);
2099 	}
2100 
2101 	kmem_free(qp, sizeof (rib_qp_t));
2102 
2103 	/*
2104 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2105 	 * then the hca is no longer being used.
2106 	 */
2107 	if (conn_list != NULL) {
2108 		rw_enter(&hca->state_lock, RW_READER);
2109 		if (hca->state == HCA_DETACHED) {
2110 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2111 			if (hca->srv_conn_list.conn_hd == NULL) {
2112 				rw_enter(&hca->cl_conn_list.conn_lock,
2113 				    RW_READER);
2114 
2115 				if (hca->cl_conn_list.conn_hd == NULL) {
2116 					mutex_enter(&hca->inuse_lock);
2117 					hca->inuse = FALSE;
2118 					cv_signal(&hca->cb_cv);
2119 					mutex_exit(&hca->inuse_lock);
2120 				}
2121 				rw_exit(&hca->cl_conn_list.conn_lock);
2122 			}
2123 			rw_exit(&hca->srv_conn_list.conn_lock);
2124 		}
2125 		rw_exit(&hca->state_lock);
2126 	}
2127 
2128 	return (RDMA_SUCCESS);
2129 }
2130 
2131 /*
2132  * All sends are done under the protection of
2133  * the wdesc->sendwait_lock. n_send_rbufs count
2134  * is protected using the send_rbufs_lock.
2135  * lock ordering is:
2136  * sendwait_lock -> send_rbufs_lock
2137  */
2138 
2139 void
2140 rib_send_hold(rib_qp_t *qp)
2141 {
2142 	mutex_enter(&qp->send_rbufs_lock);
2143 	qp->n_send_rbufs++;
2144 	mutex_exit(&qp->send_rbufs_lock);
2145 }
2146 
2147 void
2148 rib_send_rele(rib_qp_t *qp)
2149 {
2150 	mutex_enter(&qp->send_rbufs_lock);
2151 	qp->n_send_rbufs--;
2152 	if (qp->n_send_rbufs == 0)
2153 		cv_signal(&qp->send_rbufs_cv);
2154 	mutex_exit(&qp->send_rbufs_lock);
2155 }
2156 
2157 void
2158 rib_recv_rele(rib_qp_t *qp)
2159 {
2160 	mutex_enter(&qp->posted_rbufs_lock);
2161 	qp->n_posted_rbufs--;
2162 	if (qp->n_posted_rbufs == 0)
2163 		cv_signal(&qp->posted_rbufs_cv);
2164 	mutex_exit(&qp->posted_rbufs_lock);
2165 }
2166 
2167 /*
2168  * Wait for send completion notification. Only on receiving a
2169  * notification be it a successful or error completion, free the
2170  * send_wid.
2171  */
2172 static rdma_stat
2173 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2174 {
2175 	clock_t timout, cv_wait_ret;
2176 	rdma_stat error = RDMA_SUCCESS;
2177 	int	i;
2178 
2179 	/*
2180 	 * Wait for send to complete
2181 	 */
2182 	ASSERT(wd != NULL);
2183 	mutex_enter(&wd->sendwait_lock);
2184 	if (wd->status == (uint_t)SEND_WAIT) {
2185 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2186 		    ddi_get_lbolt();
2187 
2188 		if (qp->mode == RIB_SERVER) {
2189 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2190 			    &wd->sendwait_lock, timout)) > 0 &&
2191 			    wd->status == (uint_t)SEND_WAIT)
2192 				;
2193 			switch (cv_wait_ret) {
2194 			case -1:	/* timeout */
2195 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2196 
2197 				wd->cv_sig = 0;		/* no signal needed */
2198 				error = RDMA_TIMEDOUT;
2199 				break;
2200 			default:	/* got send completion */
2201 				break;
2202 			}
2203 		} else {
2204 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2205 			    &wd->sendwait_lock, timout)) > 0 &&
2206 			    wd->status == (uint_t)SEND_WAIT)
2207 				;
2208 			switch (cv_wait_ret) {
2209 			case -1:	/* timeout */
2210 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2211 
2212 				wd->cv_sig = 0;		/* no signal needed */
2213 				error = RDMA_TIMEDOUT;
2214 				break;
2215 			case 0:		/* interrupted */
2216 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2217 
2218 				wd->cv_sig = 0;		/* no signal needed */
2219 				error = RDMA_INTR;
2220 				break;
2221 			default:	/* got send completion */
2222 				break;
2223 			}
2224 		}
2225 	}
2226 
2227 	if (wd->status != (uint_t)SEND_WAIT) {
2228 		/* got send completion */
2229 		if (wd->status != RDMA_SUCCESS) {
2230 			switch (wd->status) {
2231 			case RDMA_CONNLOST:
2232 				error = RDMA_CONNLOST;
2233 				break;
2234 			default:
2235 				error = RDMA_FAILED;
2236 				break;
2237 			}
2238 		}
2239 		for (i = 0; i < wd->nsbufs; i++) {
2240 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2241 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2242 		}
2243 
2244 		rib_send_rele(qp);
2245 
2246 		mutex_exit(&wd->sendwait_lock);
2247 		(void) rib_free_sendwait(wd);
2248 
2249 	} else {
2250 		mutex_exit(&wd->sendwait_lock);
2251 	}
2252 	return (error);
2253 }
2254 
2255 static struct send_wid *
2256 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2257 {
2258 	struct send_wid	*wd;
2259 
2260 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2261 	wd->xid = xid;
2262 	wd->cv_sig = cv_sig;
2263 	wd->qp = qp;
2264 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2265 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2266 	wd->status = (uint_t)SEND_WAIT;
2267 
2268 	return (wd);
2269 }
2270 
2271 static int
2272 rib_free_sendwait(struct send_wid *wdesc)
2273 {
2274 	cv_destroy(&wdesc->wait_cv);
2275 	mutex_destroy(&wdesc->sendwait_lock);
2276 	kmem_free(wdesc, sizeof (*wdesc));
2277 
2278 	return (0);
2279 }
2280 
2281 static rdma_stat
2282 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2283 {
2284 	mutex_enter(&qp->replylist_lock);
2285 	if (rep != NULL) {
2286 		(void) rib_remreply(qp, rep);
2287 		mutex_exit(&qp->replylist_lock);
2288 		return (RDMA_SUCCESS);
2289 	}
2290 	mutex_exit(&qp->replylist_lock);
2291 	return (RDMA_FAILED);
2292 }
2293 
2294 /*
2295  * Send buffers are freed here only in case of error in posting
2296  * on QP. If the post succeeded, the send buffers are freed upon
2297  * send completion in rib_sendwait() or in the scq_handler.
2298  */
2299 rdma_stat
2300 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2301 	int send_sig, int cv_sig, caddr_t *swid)
2302 {
2303 	struct send_wid	*wdesc;
2304 	struct clist	*clp;
2305 	ibt_status_t	ibt_status = IBT_SUCCESS;
2306 	rdma_stat	ret = RDMA_SUCCESS;
2307 	ibt_send_wr_t	tx_wr;
2308 	int		i, nds;
2309 	ibt_wr_ds_t	sgl[DSEG_MAX];
2310 	uint_t		total_msg_size;
2311 	rib_qp_t	*qp;
2312 
2313 	qp = ctoqp(conn);
2314 
2315 	ASSERT(cl != NULL);
2316 
2317 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2318 
2319 	nds = 0;
2320 	total_msg_size = 0;
2321 	clp = cl;
2322 	while (clp != NULL) {
2323 		if (nds >= DSEG_MAX) {
2324 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2325 			return (RDMA_FAILED);
2326 		}
2327 		sgl[nds].ds_va = clp->w.c_saddr;
2328 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2329 		sgl[nds].ds_len = clp->c_len;
2330 		total_msg_size += clp->c_len;
2331 		clp = clp->c_next;
2332 		nds++;
2333 	}
2334 
2335 	if (send_sig) {
2336 		/* Set SEND_SIGNAL flag. */
2337 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2338 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2339 		*swid = (caddr_t)wdesc;
2340 		tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2341 		mutex_enter(&wdesc->sendwait_lock);
2342 		wdesc->nsbufs = nds;
2343 		for (i = 0; i < nds; i++) {
2344 			wdesc->sbufaddr[i] = sgl[i].ds_va;
2345 		}
2346 	} else {
2347 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2348 		*swid = NULL;
2349 		tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2350 	}
2351 
2352 	tx_wr.wr_opcode = IBT_WRC_SEND;
2353 	tx_wr.wr_trans = IBT_RC_SRV;
2354 	tx_wr.wr_nds = nds;
2355 	tx_wr.wr_sgl = sgl;
2356 
2357 	mutex_enter(&conn->c_lock);
2358 	if (conn->c_state == C_CONNECTED) {
2359 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2360 	}
2361 	if (conn->c_state != C_CONNECTED ||
2362 	    ibt_status != IBT_SUCCESS) {
2363 		if (conn->c_state != C_DISCONN_PEND)
2364 			conn->c_state = C_ERROR_CONN;
2365 		mutex_exit(&conn->c_lock);
2366 		if (send_sig) {
2367 			for (i = 0; i < nds; i++) {
2368 				rib_rbuf_free(conn, SEND_BUFFER,
2369 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2370 			}
2371 			mutex_exit(&wdesc->sendwait_lock);
2372 			(void) rib_free_sendwait(wdesc);
2373 		}
2374 		return (RDMA_CONNLOST);
2375 	}
2376 
2377 	mutex_exit(&conn->c_lock);
2378 
2379 	if (send_sig) {
2380 		rib_send_hold(qp);
2381 		mutex_exit(&wdesc->sendwait_lock);
2382 		if (cv_sig) {
2383 			/*
2384 			 * cv_wait for send to complete.
2385 			 * We can fail due to a timeout or signal or
2386 			 * unsuccessful send.
2387 			 */
2388 			ret = rib_sendwait(qp, wdesc);
2389 
2390 			return (ret);
2391 		}
2392 	}
2393 
2394 	return (RDMA_SUCCESS);
2395 }
2396 
2397 
2398 rdma_stat
2399 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2400 {
2401 	rdma_stat	ret;
2402 	caddr_t		wd;
2403 
2404 	/* send-wait & cv_signal */
2405 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2406 	return (ret);
2407 }
2408 
2409 /*
2410  * Deprecated/obsolete interface not used currently
2411  * but earlier used for READ-READ protocol.
2412  * Send RPC reply and wait for RDMA_DONE.
2413  */
2414 rdma_stat
2415 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2416 {
2417 	rdma_stat ret = RDMA_SUCCESS;
2418 	struct rdma_done_list *rd;
2419 	clock_t cv_wait_ret;
2420 	caddr_t *wid = NULL;
2421 	rib_qp_t *qp = ctoqp(conn);
2422 
2423 	mutex_enter(&qp->rdlist_lock);
2424 	rd = rdma_done_add(qp, msgid);
2425 
2426 	/* No cv_signal (whether send-wait or no-send-wait) */
2427 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2428 
2429 	if (ret != RDMA_SUCCESS) {
2430 		rdma_done_rm(qp, rd);
2431 	} else {
2432 		/*
2433 		 * Wait for RDMA_DONE from remote end
2434 		 */
2435 		cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2436 		    &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2437 		    TR_CLOCK_TICK);
2438 
2439 		rdma_done_rm(qp, rd);
2440 
2441 		if (cv_wait_ret < 0) {
2442 			ret = RDMA_TIMEDOUT;
2443 		}
2444 	}
2445 
2446 	mutex_exit(&qp->rdlist_lock);
2447 	return (ret);
2448 }
2449 
2450 static struct recv_wid *
2451 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2452 {
2453 	struct recv_wid	*rwid;
2454 
2455 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2456 	rwid->xid = msgid;
2457 	rwid->addr = sgl->ds_va;
2458 	rwid->qp = qp;
2459 
2460 	return (rwid);
2461 }
2462 
2463 static void
2464 rib_free_wid(struct recv_wid *rwid)
2465 {
2466 	kmem_free(rwid, sizeof (struct recv_wid));
2467 }
2468 
2469 rdma_stat
2470 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2471 {
2472 	rib_qp_t	*qp = ctoqp(conn);
2473 	struct clist	*clp = cl;
2474 	struct reply	*rep;
2475 	struct recv_wid	*rwid;
2476 	int		nds;
2477 	ibt_wr_ds_t	sgl[DSEG_MAX];
2478 	ibt_recv_wr_t	recv_wr;
2479 	rdma_stat	ret;
2480 	ibt_status_t	ibt_status;
2481 
2482 	/*
2483 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2484 	 */
2485 
2486 	nds = 0;
2487 	while (cl != NULL) {
2488 		if (nds >= DSEG_MAX) {
2489 			ret = RDMA_FAILED;
2490 			goto done;
2491 		}
2492 		sgl[nds].ds_va = cl->w.c_saddr;
2493 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2494 		sgl[nds].ds_len = cl->c_len;
2495 		cl = cl->c_next;
2496 		nds++;
2497 	}
2498 
2499 	if (nds != 1) {
2500 		ret = RDMA_FAILED;
2501 		goto done;
2502 	}
2503 
2504 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2505 	recv_wr.wr_nds = nds;
2506 	recv_wr.wr_sgl = sgl;
2507 
2508 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2509 	if (rwid) {
2510 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2511 	} else {
2512 		ret = RDMA_NORESOURCE;
2513 		goto done;
2514 	}
2515 	rep = rib_addreplylist(qp, msgid);
2516 	if (!rep) {
2517 		rib_free_wid(rwid);
2518 		ret = RDMA_NORESOURCE;
2519 		goto done;
2520 	}
2521 
2522 	mutex_enter(&conn->c_lock);
2523 
2524 	if (conn->c_state == C_CONNECTED) {
2525 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2526 	}
2527 
2528 	if (conn->c_state != C_CONNECTED ||
2529 	    ibt_status != IBT_SUCCESS) {
2530 		if (conn->c_state != C_DISCONN_PEND)
2531 			conn->c_state = C_ERROR_CONN;
2532 		mutex_exit(&conn->c_lock);
2533 		rib_free_wid(rwid);
2534 		(void) rib_rem_rep(qp, rep);
2535 		ret = RDMA_CONNLOST;
2536 		goto done;
2537 	}
2538 
2539 	mutex_enter(&qp->posted_rbufs_lock);
2540 	qp->n_posted_rbufs++;
2541 	mutex_exit(&qp->posted_rbufs_lock);
2542 
2543 	mutex_exit(&conn->c_lock);
2544 	return (RDMA_SUCCESS);
2545 
2546 done:
2547 	while (clp != NULL) {
2548 		rib_rbuf_free(conn, RECV_BUFFER,
2549 		    (void *)(uintptr_t)clp->w.c_saddr3);
2550 		clp = clp->c_next;
2551 	}
2552 	return (ret);
2553 }
2554 
2555 rdma_stat
2556 rib_svc_post(CONN* conn, struct clist *cl)
2557 {
2558 	rib_qp_t	*qp = ctoqp(conn);
2559 	struct svc_recv	*s_recvp;
2560 	int		nds;
2561 	ibt_wr_ds_t	sgl[DSEG_MAX];
2562 	ibt_recv_wr_t	recv_wr;
2563 	ibt_status_t	ibt_status;
2564 
2565 	nds = 0;
2566 	while (cl != NULL) {
2567 		if (nds >= DSEG_MAX) {
2568 			return (RDMA_FAILED);
2569 		}
2570 		sgl[nds].ds_va = cl->w.c_saddr;
2571 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2572 		sgl[nds].ds_len = cl->c_len;
2573 		cl = cl->c_next;
2574 		nds++;
2575 	}
2576 
2577 	if (nds != 1) {
2578 		rib_rbuf_free(conn, RECV_BUFFER,
2579 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2580 
2581 		return (RDMA_FAILED);
2582 	}
2583 
2584 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2585 	recv_wr.wr_nds = nds;
2586 	recv_wr.wr_sgl = sgl;
2587 
2588 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2589 	/* Use s_recvp's addr as wr id */
2590 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2591 	mutex_enter(&conn->c_lock);
2592 	if (conn->c_state == C_CONNECTED) {
2593 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2594 	}
2595 	if (conn->c_state != C_CONNECTED ||
2596 	    ibt_status != IBT_SUCCESS) {
2597 		if (conn->c_state != C_DISCONN_PEND)
2598 			conn->c_state = C_ERROR_CONN;
2599 		mutex_exit(&conn->c_lock);
2600 		rib_rbuf_free(conn, RECV_BUFFER,
2601 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2602 		(void) rib_free_svc_recv(s_recvp);
2603 
2604 		return (RDMA_CONNLOST);
2605 	}
2606 	mutex_exit(&conn->c_lock);
2607 
2608 	return (RDMA_SUCCESS);
2609 }
2610 
2611 /* Client */
2612 rdma_stat
2613 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2614 {
2615 	return (rib_clnt_post(conn, cl, msgid));
2616 }
2617 
2618 /* Client */
2619 rdma_stat
2620 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2621 {
2622 	rib_qp_t	*qp = ctoqp(conn);
2623 	struct reply	*rep;
2624 
2625 	mutex_enter(&qp->replylist_lock);
2626 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2627 		if (rep->xid == msgid) {
2628 			if (rep->vaddr_cq) {
2629 				rib_rbuf_free(conn, RECV_BUFFER,
2630 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2631 			}
2632 			(void) rib_remreply(qp, rep);
2633 			break;
2634 		}
2635 	}
2636 	mutex_exit(&qp->replylist_lock);
2637 
2638 	return (RDMA_SUCCESS);
2639 }
2640 
2641 /* Server */
2642 rdma_stat
2643 rib_post_recv(CONN *conn, struct clist *cl)
2644 {
2645 	rib_qp_t	*qp = ctoqp(conn);
2646 
2647 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2648 		mutex_enter(&qp->posted_rbufs_lock);
2649 		qp->n_posted_rbufs++;
2650 		mutex_exit(&qp->posted_rbufs_lock);
2651 		return (RDMA_SUCCESS);
2652 	}
2653 	return (RDMA_FAILED);
2654 }
2655 
2656 /*
2657  * Client side only interface to "recv" the rpc reply buf
2658  * posted earlier by rib_post_resp(conn, cl, msgid).
2659  */
2660 rdma_stat
2661 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2662 {
2663 	struct reply *rep = NULL;
2664 	clock_t timout, cv_wait_ret;
2665 	rdma_stat ret = RDMA_SUCCESS;
2666 	rib_qp_t *qp = ctoqp(conn);
2667 
2668 	/*
2669 	 * Find the reply structure for this msgid
2670 	 */
2671 	mutex_enter(&qp->replylist_lock);
2672 
2673 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2674 		if (rep->xid == msgid)
2675 			break;
2676 	}
2677 
2678 	if (rep != NULL) {
2679 		/*
2680 		 * If message not yet received, wait.
2681 		 */
2682 		if (rep->status == (uint_t)REPLY_WAIT) {
2683 			timout = ddi_get_lbolt() +
2684 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2685 
2686 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2687 			    &qp->replylist_lock, timout)) > 0 &&
2688 			    rep->status == (uint_t)REPLY_WAIT)
2689 				;
2690 
2691 			switch (cv_wait_ret) {
2692 			case -1:	/* timeout */
2693 				ret = RDMA_TIMEDOUT;
2694 				break;
2695 			case 0:
2696 				ret = RDMA_INTR;
2697 				break;
2698 			default:
2699 				break;
2700 			}
2701 		}
2702 
2703 		if (rep->status == RDMA_SUCCESS) {
2704 			struct clist *cl = NULL;
2705 
2706 			/*
2707 			 * Got message successfully
2708 			 */
2709 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2710 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2711 			*clp = cl;
2712 		} else {
2713 			if (rep->status != (uint_t)REPLY_WAIT) {
2714 				/*
2715 				 * Got error in reply message. Free
2716 				 * recv buffer here.
2717 				 */
2718 				ret = rep->status;
2719 				rib_rbuf_free(conn, RECV_BUFFER,
2720 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2721 			}
2722 		}
2723 		(void) rib_remreply(qp, rep);
2724 	} else {
2725 		/*
2726 		 * No matching reply structure found for given msgid on the
2727 		 * reply wait list.
2728 		 */
2729 		ret = RDMA_INVAL;
2730 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2731 	}
2732 
2733 	/*
2734 	 * Done.
2735 	 */
2736 	mutex_exit(&qp->replylist_lock);
2737 	return (ret);
2738 }
2739 
2740 /*
2741  * RDMA write a buffer to the remote address.
2742  */
2743 rdma_stat
2744 rib_write(CONN *conn, struct clist *cl, int wait)
2745 {
2746 	ibt_send_wr_t	tx_wr;
2747 	int		cv_sig;
2748 	ibt_wr_ds_t	sgl[DSEG_MAX];
2749 	struct send_wid	*wdesc;
2750 	ibt_status_t	ibt_status;
2751 	rdma_stat	ret = RDMA_SUCCESS;
2752 	rib_qp_t	*qp = ctoqp(conn);
2753 	uint64_t	n_writes = 0;
2754 
2755 	if (cl == NULL) {
2756 		return (RDMA_FAILED);
2757 	}
2758 
2759 	while ((cl != NULL)) {
2760 		if (cl->c_len > 0) {
2761 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2762 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2763 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2764 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2765 			sgl[0].ds_va = cl->w.c_saddr;
2766 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2767 			sgl[0].ds_len = cl->c_len;
2768 
2769 			if (wait) {
2770 				cv_sig = 1;
2771 			} else {
2772 				if (n_writes > max_unsignaled_rws) {
2773 					n_writes = 0;
2774 					cv_sig = 1;
2775 				} else {
2776 					cv_sig = 0;
2777 				}
2778 			}
2779 
2780 			if (cv_sig) {
2781 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2782 				wdesc = rib_init_sendwait(0, cv_sig, qp);
2783 				tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2784 				mutex_enter(&wdesc->sendwait_lock);
2785 			} else {
2786 				tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2787 				tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2788 			}
2789 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2790 			tx_wr.wr_trans = IBT_RC_SRV;
2791 			tx_wr.wr_nds = 1;
2792 			tx_wr.wr_sgl = sgl;
2793 
2794 			mutex_enter(&conn->c_lock);
2795 			if (conn->c_state == C_CONNECTED) {
2796 				ibt_status =
2797 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2798 			}
2799 			if (conn->c_state != C_CONNECTED ||
2800 			    ibt_status != IBT_SUCCESS) {
2801 				if (conn->c_state != C_DISCONN_PEND)
2802 					conn->c_state = C_ERROR_CONN;
2803 				mutex_exit(&conn->c_lock);
2804 				if (cv_sig) {
2805 					mutex_exit(&wdesc->sendwait_lock);
2806 					(void) rib_free_sendwait(wdesc);
2807 				}
2808 				return (RDMA_CONNLOST);
2809 			}
2810 
2811 			mutex_exit(&conn->c_lock);
2812 
2813 			/*
2814 			 * Wait for send to complete
2815 			 */
2816 			if (cv_sig) {
2817 
2818 				rib_send_hold(qp);
2819 				mutex_exit(&wdesc->sendwait_lock);
2820 
2821 				ret = rib_sendwait(qp, wdesc);
2822 				if (ret != 0)
2823 					return (ret);
2824 			}
2825 			n_writes ++;
2826 		}
2827 		cl = cl->c_next;
2828 	}
2829 	return (RDMA_SUCCESS);
2830 }
2831 
2832 /*
2833  * RDMA Read a buffer from the remote address.
2834  */
2835 rdma_stat
2836 rib_read(CONN *conn, struct clist *cl, int wait)
2837 {
2838 	ibt_send_wr_t	rx_wr;
2839 	int		cv_sig = 0;
2840 	ibt_wr_ds_t	sgl;
2841 	struct send_wid	*wdesc;
2842 	ibt_status_t	ibt_status = IBT_SUCCESS;
2843 	rdma_stat	ret = RDMA_SUCCESS;
2844 	rib_qp_t	*qp = ctoqp(conn);
2845 
2846 	if (cl == NULL) {
2847 		return (RDMA_FAILED);
2848 	}
2849 
2850 	while (cl != NULL) {
2851 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2852 		/*
2853 		 * Remote address is at the head chunk item in list.
2854 		 */
2855 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2856 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2857 
2858 		sgl.ds_va = cl->u.c_daddr;
2859 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2860 		sgl.ds_len = cl->c_len;
2861 
2862 		/*
2863 		 * If there are multiple chunks to be read, and
2864 		 * wait is set, ask for signal only for the last chunk
2865 		 * and wait only on the last chunk. The completion of
2866 		 * RDMA_READ on last chunk ensures that reads on all
2867 		 * previous chunks are also completed.
2868 		 */
2869 		if (wait && (cl->c_next == NULL)) {
2870 			cv_sig = 1;
2871 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2872 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2873 			rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2874 			mutex_enter(&wdesc->sendwait_lock);
2875 		} else {
2876 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2877 			rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2878 		}
2879 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2880 		rx_wr.wr_trans = IBT_RC_SRV;
2881 		rx_wr.wr_nds = 1;
2882 		rx_wr.wr_sgl = &sgl;
2883 
2884 		mutex_enter(&conn->c_lock);
2885 		if (conn->c_state == C_CONNECTED) {
2886 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2887 		}
2888 		if (conn->c_state != C_CONNECTED ||
2889 		    ibt_status != IBT_SUCCESS) {
2890 			if (conn->c_state != C_DISCONN_PEND)
2891 				conn->c_state = C_ERROR_CONN;
2892 			mutex_exit(&conn->c_lock);
2893 			if (wait && (cl->c_next == NULL)) {
2894 				mutex_exit(&wdesc->sendwait_lock);
2895 				(void) rib_free_sendwait(wdesc);
2896 			}
2897 			return (RDMA_CONNLOST);
2898 		}
2899 
2900 		mutex_exit(&conn->c_lock);
2901 
2902 		/*
2903 		 * Wait for send to complete if this is the
2904 		 * last item in the list.
2905 		 */
2906 		if (wait && cl->c_next == NULL) {
2907 			rib_send_hold(qp);
2908 			mutex_exit(&wdesc->sendwait_lock);
2909 
2910 			ret = rib_sendwait(qp, wdesc);
2911 
2912 			if (ret != 0)
2913 				return (ret);
2914 		}
2915 		cl = cl->c_next;
2916 	}
2917 	return (RDMA_SUCCESS);
2918 }
2919 
2920 /*
2921  * rib_srv_cm_handler()
2922  *    Connection Manager callback to handle RC connection requests.
2923  */
2924 /* ARGSUSED */
2925 static ibt_cm_status_t
2926 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2927 	ibt_cm_return_args_t *ret_args, void *priv_data,
2928 	ibt_priv_data_len_t len)
2929 {
2930 	queue_t		*q;
2931 	rib_qp_t	*qp;
2932 	rib_hca_t	*hca;
2933 	rdma_stat	status = RDMA_SUCCESS;
2934 	int		i;
2935 	struct clist	cl;
2936 	rdma_buf_t	rdbuf = {0};
2937 	void		*buf = NULL;
2938 	CONN		*conn;
2939 	ibt_ip_cm_info_t	ipinfo;
2940 	struct sockaddr_in *s;
2941 	struct sockaddr_in6 *s6;
2942 	int sin_size = sizeof (struct sockaddr_in);
2943 	int in_size = sizeof (struct in_addr);
2944 	int sin6_size = sizeof (struct sockaddr_in6);
2945 
2946 	ASSERT(any != NULL);
2947 	ASSERT(event != NULL);
2948 
2949 	hca = (rib_hca_t *)any;
2950 
2951 	/* got a connection request */
2952 	switch (event->cm_type) {
2953 	case IBT_CM_EVENT_REQ_RCV:
2954 		/*
2955 		 * If the plugin is in the NO_ACCEPT state, bail out.
2956 		 */
2957 		mutex_enter(&plugin_state_lock);
2958 		if (plugin_state == NO_ACCEPT) {
2959 			mutex_exit(&plugin_state_lock);
2960 			return (IBT_CM_REJECT);
2961 		}
2962 		mutex_exit(&plugin_state_lock);
2963 
2964 		/*
2965 		 * Need to send a MRA MAD to CM so that it does not
2966 		 * timeout on us.
2967 		 */
2968 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2969 		    event->cm_event.req.req_timeout * 8, NULL, 0);
2970 
2971 		mutex_enter(&rib_stat->open_hca_lock);
2972 		q = rib_stat->q;
2973 		mutex_exit(&rib_stat->open_hca_lock);
2974 
2975 		status = rib_svc_create_chan(hca, (caddr_t)q,
2976 		    event->cm_event.req.req_prim_hca_port, &qp);
2977 
2978 		if (status) {
2979 			return (IBT_CM_REJECT);
2980 		}
2981 
2982 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2983 		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2984 		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2985 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2986 
2987 		/*
2988 		 * Pre-posts RECV buffers
2989 		 */
2990 		conn = qptoc(qp);
2991 		for (i = 0; i < preposted_rbufs; i++) {
2992 			bzero(&rdbuf, sizeof (rdbuf));
2993 			rdbuf.type = RECV_BUFFER;
2994 			buf = rib_rbuf_alloc(conn, &rdbuf);
2995 			if (buf == NULL) {
2996 				/*
2997 				 * A connection is not established yet.
2998 				 * Just flush the channel. Buffers
2999 				 * posted till now will error out with
3000 				 * IBT_WC_WR_FLUSHED_ERR.
3001 				 */
3002 				(void) ibt_flush_channel(qp->qp_hdl);
3003 				(void) rib_disconnect_channel(conn, NULL);
3004 				return (IBT_CM_REJECT);
3005 			}
3006 
3007 			bzero(&cl, sizeof (cl));
3008 			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
3009 			cl.c_len = rdbuf.len;
3010 			cl.c_smemhandle.mrc_lmr =
3011 			    rdbuf.handle.mrc_lmr; /* lkey */
3012 			cl.c_next = NULL;
3013 			status = rib_post_recv(conn, &cl);
3014 			if (status != RDMA_SUCCESS) {
3015 				/*
3016 				 * A connection is not established yet.
3017 				 * Just flush the channel. Buffers
3018 				 * posted till now will error out with
3019 				 * IBT_WC_WR_FLUSHED_ERR.
3020 				 */
3021 				(void) ibt_flush_channel(qp->qp_hdl);
3022 				(void) rib_disconnect_channel(conn, NULL);
3023 				return (IBT_CM_REJECT);
3024 			}
3025 		}
3026 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
3027 
3028 		/*
3029 		 * Get the address translation
3030 		 */
3031 		rw_enter(&hca->state_lock, RW_READER);
3032 		if (hca->state == HCA_DETACHED) {
3033 			rw_exit(&hca->state_lock);
3034 			return (IBT_CM_REJECT);
3035 		}
3036 		rw_exit(&hca->state_lock);
3037 
3038 		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
3039 
3040 		if (ibt_get_ip_data(event->cm_priv_data_len,
3041 		    event->cm_priv_data,
3042 		    &ipinfo) != IBT_SUCCESS) {
3043 
3044 			return (IBT_CM_REJECT);
3045 		}
3046 
3047 		switch (ipinfo.src_addr.family) {
3048 		case AF_INET:
3049 
3050 			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
3051 			    KM_SLEEP);
3052 			(void) strcpy(conn->c_netid, RIBNETID_TCP);
3053 
3054 			conn->c_raddr.maxlen =
3055 			    conn->c_raddr.len = sin_size;
3056 			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3057 
3058 			s = (struct sockaddr_in *)conn->c_raddr.buf;
3059 			s->sin_family = AF_INET;
3060 			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
3061 			    &s->sin_addr, in_size);
3062 
3063 			conn->c_laddr.maxlen =
3064 			    conn->c_laddr.len = sin_size;
3065 			conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3066 
3067 			s = (struct sockaddr_in *)conn->c_laddr.buf;
3068 			s->sin_family = AF_INET;
3069 			bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
3070 			    &s->sin_addr, in_size);
3071 
3072 			break;
3073 
3074 		case AF_INET6:
3075 
3076 			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
3077 			    KM_SLEEP);
3078 			(void) strcpy(conn->c_netid, RIBNETID_TCP6);
3079 
3080 			conn->c_raddr.maxlen =
3081 			    conn->c_raddr.len = sin6_size;
3082 			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3083 
3084 			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3085 			s6->sin6_family = AF_INET6;
3086 			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
3087 			    &s6->sin6_addr,
3088 			    sizeof (struct in6_addr));
3089 
3090 			conn->c_laddr.maxlen =
3091 			    conn->c_laddr.len = sin6_size;
3092 			conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3093 
3094 			s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
3095 			s6->sin6_family = AF_INET6;
3096 			bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
3097 			    &s6->sin6_addr,
3098 			    sizeof (struct in6_addr));
3099 
3100 			break;
3101 
3102 		default:
3103 			return (IBT_CM_REJECT);
3104 		}
3105 
3106 		break;
3107 
3108 	case IBT_CM_EVENT_CONN_CLOSED:
3109 	{
3110 		CONN		*conn;
3111 		rib_qp_t	*qp;
3112 
3113 		switch (event->cm_event.closed) {
3114 		case IBT_CM_CLOSED_DREP_RCVD:
3115 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
3116 		case IBT_CM_CLOSED_DUP:
3117 		case IBT_CM_CLOSED_ABORT:
3118 		case IBT_CM_CLOSED_ALREADY:
3119 			/*
3120 			 * These cases indicate the local end initiated
3121 			 * the closing of the channel. Nothing to do here.
3122 			 */
3123 			break;
3124 		default:
3125 			/*
3126 			 * Reason for CONN_CLOSED event must be one of
3127 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
3128 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
3129 			 * the remote end is closing the channel. In these
3130 			 * cases free the channel and transition to error
3131 			 * state
3132 			 */
3133 			qp = ibt_get_chan_private(event->cm_channel);
3134 			conn = qptoc(qp);
3135 			mutex_enter(&conn->c_lock);
3136 			if (conn->c_state == C_DISCONN_PEND) {
3137 				mutex_exit(&conn->c_lock);
3138 				break;
3139 			}
3140 			conn->c_state = C_ERROR_CONN;
3141 
3142 			/*
3143 			 * Free the conn if c_ref goes down to 0
3144 			 */
3145 			if (conn->c_ref == 0) {
3146 				/*
3147 				 * Remove from list and free conn
3148 				 */
3149 				conn->c_state = C_DISCONN_PEND;
3150 				mutex_exit(&conn->c_lock);
3151 				(void) rib_disconnect_channel(conn,
3152 				    &hca->srv_conn_list);
3153 			} else {
3154 				/*
3155 				 * conn will be freed when c_ref goes to 0.
3156 				 * Indicate to cleaning thread not to close
3157 				 * the connection, but just free the channel.
3158 				 */
3159 				conn->c_flags |= C_CLOSE_NOTNEEDED;
3160 				mutex_exit(&conn->c_lock);
3161 			}
3162 			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3163 			break;
3164 		}
3165 		break;
3166 	}
3167 	case IBT_CM_EVENT_CONN_EST:
3168 		/*
3169 		 * RTU received, hence connection established.
3170 		 */
3171 		if (rib_debug > 1)
3172 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3173 			    "(CONN_EST) channel established");
3174 		break;
3175 
3176 	default:
3177 		if (rib_debug > 2) {
3178 			/* Let CM handle the following events. */
3179 			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3180 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3181 				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3182 			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3183 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3184 				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3185 			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3186 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3187 				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3188 			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3189 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3190 				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3191 			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3192 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3193 				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
3194 			}
3195 		}
3196 		return (IBT_CM_DEFAULT);
3197 	}
3198 
3199 	/* accept all other CM messages (i.e. let the CM handle them) */
3200 	return (IBT_CM_ACCEPT);
3201 }
3202 
/*
 * rib_register_service
 *
 * Make sure a CM service of the given service_type exists on
 * rib_stat->service_list (registering a new one with
 * ibt_register_service() if none is found) and bind it to the active
 * ports of 'hca'.
 *
 *	hca		HCA whose ports are to be bound
 *	service_type	service type to look up / record in the service list
 *	protocol_num,
 *	dst_port	passed to ibt_get_ip_sid() to form the service id
 *
 * Returns RDMA_FAILED if the HCA is detached, the port query fails,
 * service registration fails, or no port ends up bound (nbinds == 0).
 * Otherwise plugin_state is set to ACCEPT and RDMA_SUCCESS is returned.
 *
 * Locking: rib_stat->service_list_lock is held as writer from the service
 * lookup until all binds are done; hca->bound_services_lock is taken
 * (reader for lookup, writer for insertion) inside it.
 */
static rdma_stat
rib_register_service(rib_hca_t *hca, int service_type,
	uint8_t protocol_num, in_port_t dst_port)
{
	ibt_srv_desc_t		sdesc;
	ibt_hca_portinfo_t	*port_infop;
	ib_svc_id_t		srv_id;
	ibt_srv_hdl_t		srv_hdl;
	uint_t			port_size;
	uint_t			pki, i, num_ports, nbinds;
	ibt_status_t		ibt_status;
	rib_service_t		*service;
	ib_pkey_t		pkey;

	/*
	 * Query all ports for the given HCA
	 */
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
		    &num_ports, &port_size);
		rw_exit(&hca->state_lock);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}
	if (ibt_status != IBT_SUCCESS) {
		return (RDMA_FAILED);
	}

	DTRACE_PROBE1(rpcib__i__regservice_numports,
	    int, num_ports);

	/*
	 * Tracing only: fire a probe for every port reporting whether it is
	 * active or not (ports are reported 1-based).
	 */
	for (i = 0; i < num_ports; i++) {
		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
			    int, i+1);
		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
			DTRACE_PROBE1(rpcib__i__regservice__portactive,
			    int, i+1);
		}
	}

	/*
	 * Get all the IP addresses on this system to register the
	 * given "service type" on all DNS recognized IP addrs.
	 * Each service type such as NFS will have all the systems
	 * IP addresses as its different names. For now the only
	 * type of service we support in RPCIB is NFS.
	 */
	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
	/*
	 * Start registering and binding the service on the
	 * active ports of this HCA.
	 */
	nbinds = 0;
	/* Look for an already registered service of this type. */
	for (service = rib_stat->service_list;
	    service && (service->srv_type != service_type);
	    service = service->next)
		;

	if (service == NULL) {
		/*
		 * We use IP addresses as the service names for
		 * service registration.  Register each of them
		 * with CM to obtain a svc_id and svc_hdl.  We do not
		 * register the service with machine's loopback address.
		 */
		(void) bzero(&srv_id, sizeof (ib_svc_id_t));
		(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
		(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
		sdesc.sd_handler = rib_srv_cm_handler;
		sdesc.sd_flags = 0;
		ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
		    &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
		    1, &srv_hdl, &srv_id);
		/*
		 * IBT_CM_SERVICE_EXISTS is tolerated here.
		 * NOTE(review): in that case srv_hdl/srv_id keep whatever
		 * ibt_register_service() left in them (they were zeroed
		 * above) -- confirm the handle is usable for binding.
		 */
		if ((ibt_status != IBT_SUCCESS) &&
		    (ibt_status != IBT_CM_SERVICE_EXISTS)) {
			rw_exit(&rib_stat->service_list_lock);
			DTRACE_PROBE1(rpcib__i__regservice__ibtres,
			    int, ibt_status);
			ibt_free_portinfo(port_infop, port_size);
			return (RDMA_FAILED);
		}

		/*
		 * Allocate and prepare a service entry
		 */
		service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);

		service->srv_type = service_type;
		service->srv_hdl = srv_hdl;
		service->srv_id = srv_id;

		/* Push the new entry on the head of the global list. */
		service->next = rib_stat->service_list;
		rib_stat->service_list = service;
		DTRACE_PROBE1(rpcib__i__regservice__new__service,
		    int, service->srv_type);
	} else {
		srv_hdl = service->srv_hdl;
		srv_id = service->srv_id;
		DTRACE_PROBE1(rpcib__i__regservice__existing__service,
		    int, service->srv_type);
	}

	for (i = 0; i < num_ports; i++) {
		ibt_sbind_hdl_t		sbp;
		rib_hca_service_t	*hca_srv;
		ib_gid_t		gid;

		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
			continue;

		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
			pkey = port_infop[i].p_pkey_tbl[pki];

			/*
			 * Check whether this service id is already bound to
			 * this port's first source GID on this HCA.
			 */
			rw_enter(&hca->bound_services_lock, RW_READER);
			gid = port_infop[i].p_sgid_tbl[0];
			for (hca_srv = hca->bound_services; hca_srv;
			    hca_srv = hca_srv->next) {
				if ((hca_srv->srv_id == service->srv_id) &&
				    (hca_srv->gid.gid_prefix ==
				    gid.gid_prefix) &&
				    (hca_srv->gid.gid_guid == gid.gid_guid))
					break;
			}
			rw_exit(&hca->bound_services_lock);
			if (hca_srv != NULL) {
				/*
				 * port is already bound to the service
				 */
				DTRACE_PROBE1(
				    rpcib__i__regservice__already__bound,
				    int, i+1);
				nbinds++;
				continue;
			}

			/*
			 * Only bind for pkeys that have the IBSRM_HB bit set
			 * and are not IB_PKEY_INVALID_FULL.
			 */
			if ((pkey & IBSRM_HB) &&
			    (pkey != IB_PKEY_INVALID_FULL)) {

				sbp = NULL;
				ibt_status = ibt_bind_service(srv_hdl,
				    gid, NULL, hca, &sbp);

				if (ibt_status == IBT_SUCCESS) {
					/*
					 * Remember the binding on the HCA so
					 * it can be found and undone later.
					 */
					hca_srv = kmem_zalloc(
					    sizeof (rib_hca_service_t),
					    KM_SLEEP);
					hca_srv->srv_id = srv_id;
					hca_srv->gid = gid;
					hca_srv->sbind_hdl = sbp;

					rw_enter(&hca->bound_services_lock,
					    RW_WRITER);
					hca_srv->next = hca->bound_services;
					hca->bound_services = hca_srv;
					rw_exit(&hca->bound_services_lock);
					nbinds++;
				}

				DTRACE_PROBE1(rpcib__i__regservice__bindres,
				    int, ibt_status);
			}
		}
	}
	rw_exit(&rib_stat->service_list_lock);

	ibt_free_portinfo(port_infop, port_size);

	if (nbinds == 0) {
		return (RDMA_FAILED);
	} else {
		/*
		 * Put this plugin into accept state, since at least
		 * one registration was successful.
		 */
		mutex_enter(&plugin_state_lock);
		plugin_state = ACCEPT;
		mutex_exit(&plugin_state_lock);
		return (RDMA_SUCCESS);
	}
}
3386 
3387 void
3388 rib_listen(struct rdma_svc_data *rd)
3389 {
3390 	rdma_stat status;
3391 	int n_listening = 0;
3392 	rib_hca_t *hca;
3393 
3394 	mutex_enter(&rib_stat->listen_lock);
3395 	/*
3396 	 * if rd parameter is NULL then it means that rib_stat->q is
3397 	 * already initialized by a call from RDMA and we just want to
3398 	 * add a newly attached HCA to the same listening state as other
3399 	 * HCAs.
3400 	 */
3401 	if (rd == NULL) {
3402 		if (rib_stat->q == NULL) {
3403 			mutex_exit(&rib_stat->listen_lock);
3404 			return;
3405 		}
3406 	} else {
3407 		rib_stat->q = &rd->q;
3408 	}
3409 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3410 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3411 		/*
3412 		 * First check if a hca is still attached
3413 		 */
3414 		rw_enter(&hca->state_lock, RW_READER);
3415 		if (hca->state != HCA_INITED) {
3416 			rw_exit(&hca->state_lock);
3417 			continue;
3418 		}
3419 		rw_exit(&hca->state_lock);
3420 
3421 		/*
3422 		 * Right now the only service type is NFS. Hence
3423 		 * force feed this value. Ideally to communicate
3424 		 * the service type it should be passed down in
3425 		 * rdma_svc_data.
3426 		 */
3427 		status = rib_register_service(hca, NFS,
3428 		    IPPROTO_TCP, nfs_rdma_port);
3429 		if (status == RDMA_SUCCESS)
3430 			n_listening++;
3431 	}
3432 	rw_exit(&rib_stat->hcas_list_lock);
3433 
3434 	/*
3435 	 * Service active on an HCA, check rd->err_code for more
3436 	 * explainable errors.
3437 	 */
3438 	if (rd) {
3439 		if (n_listening > 0) {
3440 			rd->active = 1;
3441 			rd->err_code = RDMA_SUCCESS;
3442 		} else {
3443 			rd->active = 0;
3444 			rd->err_code = RDMA_FAILED;
3445 		}
3446 	}
3447 	mutex_exit(&rib_stat->listen_lock);
3448 }
3449 
3450 /* XXXX */
3451 /* ARGSUSED */
3452 static void
3453 rib_listen_stop(struct rdma_svc_data *svcdata)
3454 {
3455 	rib_hca_t		*hca;
3456 
3457 	mutex_enter(&rib_stat->listen_lock);
3458 	/*
3459 	 * KRPC called the RDMATF to stop the listeners, this means
3460 	 * stop sending incomming or recieved requests to KRPC master
3461 	 * transport handle for RDMA-IB. This is also means that the
3462 	 * master transport handle, responsible for us, is going away.
3463 	 */
3464 	mutex_enter(&plugin_state_lock);
3465 	plugin_state = NO_ACCEPT;
3466 	if (svcdata != NULL)
3467 		svcdata->active = 0;
3468 	mutex_exit(&plugin_state_lock);
3469 
3470 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3471 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3472 		/*
3473 		 * First check if a hca is still attached
3474 		 */
3475 		rw_enter(&hca->state_lock, RW_READER);
3476 		if (hca->state == HCA_DETACHED) {
3477 			rw_exit(&hca->state_lock);
3478 			continue;
3479 		}
3480 		rib_close_channels(&hca->srv_conn_list);
3481 		rib_stop_services(hca);
3482 		rw_exit(&hca->state_lock);
3483 	}
3484 	rw_exit(&rib_stat->hcas_list_lock);
3485 
3486 	/*
3487 	 * Avoid rib_listen() using the stale q field.
3488 	 * This could happen if a port goes up after all services
3489 	 * are already unregistered.
3490 	 */
3491 	rib_stat->q = NULL;
3492 	mutex_exit(&rib_stat->listen_lock);
3493 }
3494 
3495 /*
3496  * Traverse the HCA's service list to unbind and deregister services.
3497  * For each bound service of HCA to be removed, first find the corresponding
3498  * service handle (srv_hdl) and then unbind the service by calling
3499  * ibt_unbind_service().
3500  */
3501 static void
3502 rib_stop_services(rib_hca_t *hca)
3503 {
3504 	rib_hca_service_t *srv_list, *to_remove;
3505 
3506 	/*
3507 	 * unbind and deregister the services for this service type.
3508 	 * Right now there is only one service type. In future it will
3509 	 * be passed down to this function.
3510 	 */
3511 	rw_enter(&hca->bound_services_lock, RW_READER);
3512 	srv_list = hca->bound_services;
3513 	hca->bound_services = NULL;
3514 	rw_exit(&hca->bound_services_lock);
3515 
3516 	while (srv_list != NULL) {
3517 		rib_service_t *sc;
3518 
3519 		to_remove = srv_list;
3520 		srv_list = to_remove->next;
3521 		rw_enter(&rib_stat->service_list_lock, RW_READER);
3522 		for (sc = rib_stat->service_list;
3523 		    sc && (sc->srv_id != to_remove->srv_id);
3524 		    sc = sc->next)
3525 			;
3526 		/*
3527 		 * if sc is NULL then the service doesn't exist anymore,
3528 		 * probably just removed completely through rib_stat.
3529 		 */
3530 		if (sc != NULL)
3531 			(void) ibt_unbind_service(sc->srv_hdl,
3532 			    to_remove->sbind_hdl);
3533 		rw_exit(&rib_stat->service_list_lock);
3534 		kmem_free(to_remove, sizeof (rib_hca_service_t));
3535 	}
3536 }
3537 
3538 static struct svc_recv *
3539 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3540 {
3541 	struct svc_recv	*recvp;
3542 
3543 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3544 	recvp->vaddr = sgl->ds_va;
3545 	recvp->qp = qp;
3546 	recvp->bytes_xfer = 0;
3547 	return (recvp);
3548 }
3549 
3550 static int
3551 rib_free_svc_recv(struct svc_recv *recvp)
3552 {
3553 	kmem_free(recvp, sizeof (*recvp));
3554 
3555 	return (0);
3556 }
3557 
3558 static struct reply *
3559 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3560 {
3561 	struct reply	*rep;
3562 
3563 
3564 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3565 	if (rep == NULL) {
3566 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3567 		return (NULL);
3568 	}
3569 	rep->xid = msgid;
3570 	rep->vaddr_cq = NULL;
3571 	rep->bytes_xfer = 0;
3572 	rep->status = (uint_t)REPLY_WAIT;
3573 	rep->prev = NULL;
3574 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3575 
3576 	mutex_enter(&qp->replylist_lock);
3577 	if (qp->replylist) {
3578 		rep->next = qp->replylist;
3579 		qp->replylist->prev = rep;
3580 	}
3581 	qp->rep_list_size++;
3582 
3583 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3584 	    int, qp->rep_list_size);
3585 
3586 	qp->replylist = rep;
3587 	mutex_exit(&qp->replylist_lock);
3588 
3589 	return (rep);
3590 }
3591 
3592 static rdma_stat
3593 rib_rem_replylist(rib_qp_t *qp)
3594 {
3595 	struct reply	*r, *n;
3596 
3597 	mutex_enter(&qp->replylist_lock);
3598 	for (r = qp->replylist; r != NULL; r = n) {
3599 		n = r->next;
3600 		(void) rib_remreply(qp, r);
3601 	}
3602 	mutex_exit(&qp->replylist_lock);
3603 
3604 	return (RDMA_SUCCESS);
3605 }
3606 
3607 static int
3608 rib_remreply(rib_qp_t *qp, struct reply *rep)
3609 {
3610 
3611 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3612 	if (rep->prev) {
3613 		rep->prev->next = rep->next;
3614 	}
3615 	if (rep->next) {
3616 		rep->next->prev = rep->prev;
3617 	}
3618 	if (qp->replylist == rep)
3619 		qp->replylist = rep->next;
3620 
3621 	cv_destroy(&rep->wait_cv);
3622 	qp->rep_list_size--;
3623 
3624 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3625 	    int, qp->rep_list_size);
3626 
3627 	kmem_free(rep, sizeof (*rep));
3628 
3629 	return (0);
3630 }
3631 
3632 rdma_stat
3633 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3634 	struct mrc *buf_handle)
3635 {
3636 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3637 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3638 	rdma_stat	status;
3639 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3640 
3641 	/*
3642 	 * Note: ALL buffer pools use the same memory type RDMARW.
3643 	 */
3644 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3645 	if (status == RDMA_SUCCESS) {
3646 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3647 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3648 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3649 	} else {
3650 		buf_handle->mrc_linfo = NULL;
3651 		buf_handle->mrc_lmr = 0;
3652 		buf_handle->mrc_rmr = 0;
3653 	}
3654 	return (status);
3655 }
3656 
3657 static rdma_stat
3658 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3659 	ibt_mr_flags_t spec,
3660 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3661 {
3662 	ibt_mr_attr_t	mem_attr;
3663 	ibt_status_t	ibt_status;
3664 	mem_attr.mr_vaddr = (uintptr_t)buf;
3665 	mem_attr.mr_len = (ib_msglen_t)size;
3666 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3667 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3668 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3669 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3670 
3671 	rw_enter(&hca->state_lock, RW_READER);
3672 	if (hca->state != HCA_DETACHED) {
3673 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3674 		    &mem_attr, mr_hdlp, mr_descp);
3675 		rw_exit(&hca->state_lock);
3676 	} else {
3677 		rw_exit(&hca->state_lock);
3678 		return (RDMA_FAILED);
3679 	}
3680 
3681 	if (ibt_status != IBT_SUCCESS) {
3682 		return (RDMA_FAILED);
3683 	}
3684 	return (RDMA_SUCCESS);
3685 }
3686 
3687 rdma_stat
3688 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3689 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3690 {
3691 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3692 	rib_lrc_entry_t *l;
3693 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3694 	rdma_stat	status;
3695 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3696 
3697 	/*
3698 	 * Non-coherent memory registration.
3699 	 */
3700 	l = (rib_lrc_entry_t *)lrc;
3701 	if (l) {
3702 		if (l->registered) {
3703 			buf_handle->mrc_linfo =
3704 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3705 			buf_handle->mrc_lmr =
3706 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3707 			buf_handle->mrc_rmr =
3708 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3709 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3710 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3711 			return (RDMA_SUCCESS);
3712 		} else {
3713 			/* Always register the whole buffer */
3714 			buf = (caddr_t)l->lrc_buf;
3715 			buflen = l->lrc_len;
3716 		}
3717 	}
3718 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3719 
3720 	if (status == RDMA_SUCCESS) {
3721 		if (l) {
3722 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3723 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3724 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3725 			l->registered		 = TRUE;
3726 		}
3727 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3728 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3729 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3730 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3731 	} else {
3732 		buf_handle->mrc_linfo = NULL;
3733 		buf_handle->mrc_lmr = 0;
3734 		buf_handle->mrc_rmr = 0;
3735 	}
3736 	return (status);
3737 }
3738 
3739 /* ARGSUSED */
3740 rdma_stat
3741 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3742 {
3743 	rib_hca_t *hca = (ctoqp(conn))->hca;
3744 	/*
3745 	 * Allow memory deregistration even if HCA is
3746 	 * getting detached. Need all outstanding
3747 	 * memory registrations to be deregistered
3748 	 * before HCA_DETACH_EVENT can be accepted.
3749 	 */
3750 	(void) ibt_deregister_mr(hca->hca_hdl,
3751 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3752 	return (RDMA_SUCCESS);
3753 }
3754 
3755 /* ARGSUSED */
3756 rdma_stat
3757 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3758 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3759 {
3760 	rib_lrc_entry_t *l;
3761 	l = (rib_lrc_entry_t *)lrc;
3762 	if (l)
3763 		if (l->registered)
3764 			return (RDMA_SUCCESS);
3765 
3766 	(void) rib_deregistermem(conn, buf, buf_handle);
3767 
3768 	return (RDMA_SUCCESS);
3769 }
3770 
3771 /* ARGSUSED */
3772 rdma_stat
3773 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3774 		int len, int cpu)
3775 {
3776 	ibt_status_t	status;
3777 	rib_hca_t *hca = (ctoqp(conn))->hca;
3778 	ibt_mr_sync_t	mr_segment;
3779 
3780 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3781 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3782 	mr_segment.ms_len = (ib_memlen_t)len;
3783 	if (cpu) {
3784 		/* make incoming data visible to memory */
3785 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3786 	} else {
3787 		/* make memory changes visible to IO */
3788 		mr_segment.ms_flags = IBT_SYNC_READ;
3789 	}
3790 	rw_enter(&hca->state_lock, RW_READER);
3791 	if (hca->state != HCA_DETACHED) {
3792 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3793 		rw_exit(&hca->state_lock);
3794 	} else {
3795 		rw_exit(&hca->state_lock);
3796 		return (RDMA_FAILED);
3797 	}
3798 
3799 	if (status == IBT_SUCCESS)
3800 		return (RDMA_SUCCESS);
3801 	else {
3802 		return (RDMA_FAILED);
3803 	}
3804 }
3805 
3806 /*
3807  * XXXX	????
3808  */
3809 static rdma_stat
3810 rib_getinfo(rdma_info_t *info)
3811 {
3812 	/*
3813 	 * XXXX	Hack!
3814 	 */
3815 	info->addrlen = 16;
3816 	info->mts = 1000000;
3817 	info->mtu = 1000000;
3818 
3819 	return (RDMA_SUCCESS);
3820 }
3821 
3822 rib_bufpool_t *
3823 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3824 {
3825 	rib_bufpool_t	*rbp = NULL;
3826 	bufpool_t	*bp = NULL;
3827 	caddr_t		buf;
3828 	ibt_mr_attr_t	mem_attr;
3829 	ibt_status_t	ibt_status;
3830 	int		i, j;
3831 
3832 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3833 
3834 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3835 	    num * sizeof (void *), KM_SLEEP);
3836 
3837 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3838 	bp->numelems = num;
3839 
3840 
3841 	switch (ptype) {
3842 	case SEND_BUFFER:
3843 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3844 		bp->rsize = RPC_MSG_SZ;
3845 		break;
3846 	case RECV_BUFFER:
3847 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3848 		bp->rsize = RPC_BUF_SIZE;
3849 		break;
3850 	default:
3851 		goto fail;
3852 	}
3853 
3854 	/*
3855 	 * Register the pool.
3856 	 */
3857 	bp->bufsize = num * bp->rsize;
3858 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3859 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3860 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3861 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3862 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3863 	rw_enter(&hca->state_lock, RW_READER);
3864 
3865 	if (hca->state == HCA_DETACHED) {
3866 		rw_exit(&hca->state_lock);
3867 		goto fail;
3868 	}
3869 
3870 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3871 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3872 		mem_attr.mr_vaddr = (uintptr_t)buf;
3873 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3874 		mem_attr.mr_as = NULL;
3875 		ibt_status = ibt_register_mr(hca->hca_hdl,
3876 		    hca->pd_hdl, &mem_attr,
3877 		    &rbp->mr_hdl[i],
3878 		    &rbp->mr_desc[i]);
3879 		if (ibt_status != IBT_SUCCESS) {
3880 			for (j = 0; j < i; j++) {
3881 				(void) ibt_deregister_mr(hca->hca_hdl,
3882 				    rbp->mr_hdl[j]);
3883 			}
3884 			rw_exit(&hca->state_lock);
3885 			goto fail;
3886 		}
3887 	}
3888 	rw_exit(&hca->state_lock);
3889 	buf = (caddr_t)bp->buf;
3890 	for (i = 0; i < num; i++, buf += bp->rsize) {
3891 		bp->buflist[i] = (void *)buf;
3892 	}
3893 	bp->buffree = num - 1;	/* no. of free buffers */
3894 	rbp->bpool = bp;
3895 
3896 	return (rbp);
3897 fail:
3898 	if (bp) {
3899 		if (bp->buf)
3900 			kmem_free(bp->buf, bp->bufsize);
3901 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3902 	}
3903 	if (rbp) {
3904 		if (rbp->mr_hdl)
3905 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3906 		if (rbp->mr_desc)
3907 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3908 		kmem_free(rbp, sizeof (rib_bufpool_t));
3909 	}
3910 	return (NULL);
3911 }
3912 
3913 static void
3914 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3915 {
3916 	int i;
3917 	rib_bufpool_t *rbp = NULL;
3918 	bufpool_t *bp;
3919 
3920 	/*
3921 	 * Obtain pool address based on type of pool
3922 	 */
3923 	switch (ptype) {
3924 		case SEND_BUFFER:
3925 			rbp = hca->send_pool;
3926 			break;
3927 		case RECV_BUFFER:
3928 			rbp = hca->recv_pool;
3929 			break;
3930 		default:
3931 			return;
3932 	}
3933 	if (rbp == NULL)
3934 		return;
3935 
3936 	bp = rbp->bpool;
3937 
3938 	/*
3939 	 * Deregister the pool memory and free it.
3940 	 */
3941 	for (i = 0; i < bp->numelems; i++) {
3942 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3943 	}
3944 }
3945 
3946 static void
3947 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3948 {
3949 
3950 	rib_bufpool_t *rbp = NULL;
3951 	bufpool_t *bp;
3952 
3953 	/*
3954 	 * Obtain pool address based on type of pool
3955 	 */
3956 	switch (ptype) {
3957 		case SEND_BUFFER:
3958 			rbp = hca->send_pool;
3959 			break;
3960 		case RECV_BUFFER:
3961 			rbp = hca->recv_pool;
3962 			break;
3963 		default:
3964 			return;
3965 	}
3966 	if (rbp == NULL)
3967 		return;
3968 
3969 	bp = rbp->bpool;
3970 
3971 	/*
3972 	 * Free the pool memory.
3973 	 */
3974 	if (rbp->mr_hdl)
3975 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3976 
3977 	if (rbp->mr_desc)
3978 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3979 	if (bp->buf)
3980 		kmem_free(bp->buf, bp->bufsize);
3981 	mutex_destroy(&bp->buflock);
3982 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3983 	kmem_free(rbp, sizeof (rib_bufpool_t));
3984 }
3985 
3986 void
3987 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3988 {
3989 	/*
3990 	 * Deregister the pool memory and free it.
3991 	 */
3992 	rib_rbufpool_deregister(hca, ptype);
3993 	rib_rbufpool_free(hca, ptype);
3994 }
3995 
3996 /*
3997  * Fetch a buffer from the pool of type specified in rdbuf->type.
3998  */
3999 static rdma_stat
4000 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4001 {
4002 	rib_lrc_entry_t *rlep;
4003 
4004 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
4005 		rlep = rib_get_cache_buf(conn, rdbuf->len);
4006 		rdbuf->rb_private =  (caddr_t)rlep;
4007 		rdbuf->addr = rlep->lrc_buf;
4008 		rdbuf->handle = rlep->lrc_mhandle;
4009 		return (RDMA_SUCCESS);
4010 	}
4011 
4012 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4013 	if (rdbuf->addr) {
4014 		switch (rdbuf->type) {
4015 		case SEND_BUFFER:
4016 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
4017 			break;
4018 		case RECV_BUFFER:
4019 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
4020 			break;
4021 		default:
4022 			rdbuf->len = 0;
4023 		}
4024 		return (RDMA_SUCCESS);
4025 	} else
4026 		return (RDMA_FAILED);
4027 }
4028 
4029 /*
4030  * Fetch a buffer of specified type.
4031  * Note that rdbuf->handle is mw's rkey.
4032  */
4033 static void *
4034 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4035 {
4036 	rib_qp_t	*qp = ctoqp(conn);
4037 	rib_hca_t	*hca = qp->hca;
4038 	rdma_btype	ptype = rdbuf->type;
4039 	void		*buf;
4040 	rib_bufpool_t	*rbp = NULL;
4041 	bufpool_t	*bp;
4042 	int		i;
4043 
4044 	/*
4045 	 * Obtain pool address based on type of pool
4046 	 */
4047 	switch (ptype) {
4048 	case SEND_BUFFER:
4049 		rbp = hca->send_pool;
4050 		break;
4051 	case RECV_BUFFER:
4052 		rbp = hca->recv_pool;
4053 		break;
4054 	default:
4055 		return (NULL);
4056 	}
4057 	if (rbp == NULL)
4058 		return (NULL);
4059 
4060 	bp = rbp->bpool;
4061 
4062 	mutex_enter(&bp->buflock);
4063 	if (bp->buffree < 0) {
4064 		mutex_exit(&bp->buflock);
4065 		return (NULL);
4066 	}
4067 
4068 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4069 	buf = bp->buflist[bp->buffree];
4070 	rdbuf->addr = buf;
4071 	rdbuf->len = bp->rsize;
4072 	for (i = bp->numelems - 1; i >= 0; i--) {
4073 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4074 			rdbuf->handle.mrc_rmr =
4075 			    (uint32_t)rbp->mr_desc[i].md_rkey;
4076 			rdbuf->handle.mrc_linfo =
4077 			    (uintptr_t)rbp->mr_hdl[i];
4078 			rdbuf->handle.mrc_lmr =
4079 			    (uint32_t)rbp->mr_desc[i].md_lkey;
4080 			bp->buffree--;
4081 
4082 			mutex_exit(&bp->buflock);
4083 
4084 			return (buf);
4085 		}
4086 	}
4087 
4088 	mutex_exit(&bp->buflock);
4089 
4090 	return (NULL);
4091 }
4092 
4093 static void
4094 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4095 {
4096 
4097 	if (rdbuf->type == RDMA_LONG_BUFFER) {
4098 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4099 		rdbuf->rb_private = NULL;
4100 		return;
4101 	}
4102 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4103 }
4104 
4105 static void
4106 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4107 {
4108 	rib_qp_t *qp = ctoqp(conn);
4109 	rib_hca_t *hca = qp->hca;
4110 	rib_bufpool_t *rbp = NULL;
4111 	bufpool_t *bp;
4112 
4113 	/*
4114 	 * Obtain pool address based on type of pool
4115 	 */
4116 	switch (ptype) {
4117 	case SEND_BUFFER:
4118 		rbp = hca->send_pool;
4119 		break;
4120 	case RECV_BUFFER:
4121 		rbp = hca->recv_pool;
4122 		break;
4123 	default:
4124 		return;
4125 	}
4126 	if (rbp == NULL)
4127 		return;
4128 
4129 	bp = rbp->bpool;
4130 
4131 	mutex_enter(&bp->buflock);
4132 	if (++bp->buffree >= bp->numelems) {
4133 		/*
4134 		 * Should never happen
4135 		 */
4136 		bp->buffree--;
4137 	} else {
4138 		bp->buflist[bp->buffree] = buf;
4139 	}
4140 	mutex_exit(&bp->buflock);
4141 }
4142 
4143 static rdma_stat
4144 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4145 {
4146 	rw_enter(&connlist->conn_lock, RW_WRITER);
4147 	if (connlist->conn_hd) {
4148 		cn->c_next = connlist->conn_hd;
4149 		connlist->conn_hd->c_prev = cn;
4150 	}
4151 	connlist->conn_hd = cn;
4152 	rw_exit(&connlist->conn_lock);
4153 
4154 	return (RDMA_SUCCESS);
4155 }
4156 
4157 static rdma_stat
4158 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4159 {
4160 	rw_enter(&connlist->conn_lock, RW_WRITER);
4161 	if (cn->c_prev) {
4162 		cn->c_prev->c_next = cn->c_next;
4163 	}
4164 	if (cn->c_next) {
4165 		cn->c_next->c_prev = cn->c_prev;
4166 	}
4167 	if (connlist->conn_hd == cn)
4168 		connlist->conn_hd = cn->c_next;
4169 	rw_exit(&connlist->conn_lock);
4170 
4171 	return (RDMA_SUCCESS);
4172 }
4173 
4174 /* ARGSUSED */
4175 static rdma_stat
4176 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4177     int addr_type, void *handle, CONN **conn)
4178 {
4179 	rdma_stat status;
4180 	rpcib_ping_t rpt;
4181 
4182 	status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4183 	return (status);
4184 }
4185 
/*
 * rib_find_hca_connection
 *
 * if there is an existing connection to the specified address then
 * it will be returned in conn, otherwise conn will be set to NULL.
 * Also cleans up any connection that is in error state.
 *
 * Walks hca->cl_conn_list looking for a connection whose remote address
 * equals d_svcaddr and, when s_svcaddr->len is non-zero, whose local
 * address equals s_svcaddr.
 *
 * Returns:
 *	RDMA_SUCCESS	a C_CONNECTED connection was found; *conn is set
 *			and a reference (c_ref) is held on it.
 *	RDMA_INTR	the wait for a pending connection returned 0.
 *	RDMA_TIMEDOUT	the wait for a pending connection returned < 0,
 *			or the connection left C_CONN_PEND without
 *			becoming C_CONNECTED.
 *	RDMA_FAILED	no matching connection on this HCA.
 */
static int
rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
    struct netbuf *d_svcaddr, CONN **conn)
{
	CONN *cn;
	clock_t cv_stat, timout;

	*conn = NULL;
again:
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	cn = hca->cl_conn_list.conn_hd;
	while (cn != NULL) {
		/*
		 * First, clear up any connection in the ERROR state
		 */
		mutex_enter(&cn->c_lock);
		if (cn->c_state == C_ERROR_CONN) {
			if (cn->c_ref == 0) {
				/*
				 * Remove connection from list and destroy it.
				 * Both locks are dropped before closing, so
				 * the walk restarts from the list head.
				 */
				cn->c_state = C_DISCONN_PEND;
				mutex_exit(&cn->c_lock);
				rw_exit(&hca->cl_conn_list.conn_lock);
				rib_conn_close((void *)cn);
				goto again;
			}
			/* Still referenced: leave it alone and move on. */
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}
		/* Connections already being torn down are skipped. */
		if (cn->c_state == C_DISCONN_PEND) {
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}

		/*
		 * source address is only checked for if there is one,
		 * this is the case for retries.
		 */
		if ((cn->c_raddr.len == d_svcaddr->len) &&
		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
		    d_svcaddr->len) == 0) &&
		    ((s_svcaddr->len == 0) ||
		    ((cn->c_laddr.len == s_svcaddr->len) &&
		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
		    s_svcaddr->len) == 0)))) {
			/*
			 * Our connection. Give up conn list lock
			 * as we are done traversing the list.
			 */
			rw_exit(&hca->cl_conn_list.conn_lock);
			if (cn->c_state == C_CONNECTED) {
				cn->c_ref++;	/* sharing a conn */
				mutex_exit(&cn->c_lock);
				*conn = cn;
				return (RDMA_SUCCESS);
			}
			if (cn->c_state == C_CONN_PEND) {
				/*
				 * Hold a reference to this conn before
				 * we give up the lock.
				 */
				cn->c_ref++;
				/*
				 * Wait (interruptibly) on c_cv until the
				 * state leaves C_CONN_PEND or the deadline,
				 * CONN_WAIT_TIME seconds from now, passes.
				 */
				timout =  ddi_get_lbolt() +
				    drv_usectohz(CONN_WAIT_TIME * 1000000);
				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
				    &cn->c_lock, timout)) > 0 &&
				    cn->c_state == C_CONN_PEND)
					;
				/*
				 * On every failure path below the reference
				 * taken above is dropped again;
				 * rib_conn_release_locked() presumably also
				 * releases c_lock -- verify.
				 */
				if (cv_stat == 0) {
					(void) rib_conn_release_locked(cn);
					return (RDMA_INTR);
				}
				if (cv_stat < 0) {
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
				if (cn->c_state == C_CONNECTED) {
					/* Keep the reference for the caller. */
					*conn = cn;
					mutex_exit(&cn->c_lock);
					return (RDMA_SUCCESS);
				} else {
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
			}
			/*
			 * NOTE(review): if a matching connection is in
			 * neither of the two states above, control falls
			 * through with the list lock already released and
			 * the walk continues (ending in a second rw_exit).
			 * Presumably unreachable given the states handled
			 * earlier -- confirm.
			 */
		}
		mutex_exit(&cn->c_lock);
		cn = cn->c_next;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	*conn = NULL;
	return (RDMA_FAILED);
}
4289 
4290 /*
4291  * Connection management.
4292  * IBTF does not support recycling of channels. So connections are only
4293  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
4294  * C_DISCONN_PEND state. No C_IDLE state.
4295  * C_CONN_PEND state: Connection establishment in progress to the server.
4296  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4297  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4298  * only in this state.
4299  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4300  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4301  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4302  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4303  * c_ref drops to 0 (this indicates that RPC has no more references to this
4304  * connection), the connection should be destroyed. A connection transitions
4305  * into this state when it is being destroyed.
4306  */
4307 /* ARGSUSED */
4308 static rdma_stat
4309 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4310     int addr_type, rpcib_ping_t *rpt, CONN **conn)
4311 {
4312 	CONN *cn;
4313 	int status;
4314 	rib_hca_t *hca;
4315 	rib_qp_t *qp;
4316 	int s_addr_len;
4317 	char *s_addr_buf;
4318 
4319 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4320 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4321 		rw_enter(&hca->state_lock, RW_READER);
4322 		if (hca->state != HCA_DETACHED) {
4323 			status = rib_find_hca_connection(hca, s_svcaddr,
4324 			    d_svcaddr, conn);
4325 			rw_exit(&hca->state_lock);
4326 			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4327 				rw_exit(&rib_stat->hcas_list_lock);
4328 				return (status);
4329 			}
4330 		} else
4331 			rw_exit(&hca->state_lock);
4332 	}
4333 	rw_exit(&rib_stat->hcas_list_lock);
4334 
4335 	/*
4336 	 * No existing connection found, establish a new connection.
4337 	 */
4338 	bzero(rpt, sizeof (rpcib_ping_t));
4339 
4340 	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4341 	if (status != RDMA_SUCCESS) {
4342 		return (RDMA_FAILED);
4343 	}
4344 	hca = rpt->hca;
4345 
4346 	if (rpt->srcip.family == AF_INET) {
4347 		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4348 		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4349 	} else if (rpt->srcip.family == AF_INET6) {
4350 		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4351 		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4352 	} else {
4353 		return (RDMA_FAILED);
4354 	}
4355 
4356 	/*
4357 	 * Channel to server doesn't exist yet, create one.
4358 	 */
4359 	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4360 		return (RDMA_FAILED);
4361 	}
4362 	cn = qptoc(qp);
4363 	cn->c_state = C_CONN_PEND;
4364 	cn->c_ref = 1;
4365 
4366 	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4367 	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4368 	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4369 
4370 	if (rpt->srcip.family == AF_INET) {
4371 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4372 		(void) strcpy(cn->c_netid, RIBNETID_TCP);
4373 	} else {
4374 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4375 		(void) strcpy(cn->c_netid, RIBNETID_TCP6);
4376 	}
4377 
4378 	/*
4379 	 * Add to conn list.
4380 	 * We had given up the READER lock. In the time since then,
4381 	 * another thread might have created the connection we are
4382 	 * trying here. But for now, that is quiet alright - there
4383 	 * might be two connections between a pair of hosts instead
4384 	 * of one. If we really want to close that window,
4385 	 * then need to check the list after acquiring the
4386 	 * WRITER lock.
4387 	 */
4388 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4389 	status = rib_conn_to_srv(hca, qp, rpt);
4390 	mutex_enter(&cn->c_lock);
4391 
4392 	if (cn->c_flags & C_CLOSE_PENDING) {
4393 		/*
4394 		 * This handles a case where the module or
4395 		 * HCA detached in the time a connection is
4396 		 * established. In such a case close the
4397 		 * connection immediately if this is the
4398 		 * only reference.
4399 		 */
4400 		if (cn->c_ref == 1) {
4401 			cn->c_ref--;
4402 			cn->c_state = C_DISCONN_PEND;
4403 			mutex_exit(&cn->c_lock);
4404 			rib_conn_close((void *)cn);
4405 			return (RDMA_FAILED);
4406 		}
4407 
4408 		/*
4409 		 * Connection to be closed later when c_ref = 0
4410 		 */
4411 		status = RDMA_FAILED;
4412 	}
4413 
4414 	if (status == RDMA_SUCCESS) {
4415 		cn->c_state = C_CONNECTED;
4416 		*conn = cn;
4417 	} else {
4418 		cn->c_state = C_ERROR_CONN;
4419 		cn->c_ref--;
4420 	}
4421 	cv_signal(&cn->c_cv);
4422 	mutex_exit(&cn->c_lock);
4423 	return (status);
4424 }
4425 
4426 static void
4427 rib_conn_close(void *rarg)
4428 {
4429 	CONN *conn = (CONN *)rarg;
4430 	rib_qp_t *qp = ctoqp(conn);
4431 
4432 	mutex_enter(&conn->c_lock);
4433 	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4434 
4435 		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4436 
4437 		/*
4438 		 * Live connection in CONNECTED state.
4439 		 */
4440 		if (conn->c_state == C_CONNECTED) {
4441 			conn->c_state = C_ERROR_CONN;
4442 		}
4443 		mutex_exit(&conn->c_lock);
4444 
4445 		rib_close_a_channel(conn);
4446 
4447 		mutex_enter(&conn->c_lock);
4448 		conn->c_flags &= ~C_CLOSE_PENDING;
4449 	}
4450 
4451 	mutex_exit(&conn->c_lock);
4452 
4453 	if (qp->mode == RIB_SERVER)
4454 		(void) rib_disconnect_channel(conn,
4455 		    &qp->hca->srv_conn_list);
4456 	else
4457 		(void) rib_disconnect_channel(conn,
4458 		    &qp->hca->cl_conn_list);
4459 }
4460 
4461 static void
4462 rib_conn_timeout_call(void *carg)
4463 {
4464 	time_t idle_time;
4465 	CONN *conn = (CONN *)carg;
4466 	rib_hca_t *hca = ctoqp(conn)->hca;
4467 	int error;
4468 
4469 	mutex_enter(&conn->c_lock);
4470 	if ((conn->c_ref > 0) ||
4471 	    (conn->c_state == C_DISCONN_PEND)) {
4472 		conn->c_timeout = NULL;
4473 		mutex_exit(&conn->c_lock);
4474 		return;
4475 	}
4476 
4477 	idle_time = (gethrestime_sec() - conn->c_last_used);
4478 
4479 	if ((idle_time <= rib_conn_timeout) &&
4480 	    (conn->c_state != C_ERROR_CONN)) {
4481 		/*
4482 		 * There was activity after the last timeout.
4483 		 * Extend the conn life. Unless the conn is
4484 		 * already in error state.
4485 		 */
4486 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4487 		    SEC_TO_TICK(rib_conn_timeout - idle_time));
4488 		mutex_exit(&conn->c_lock);
4489 		return;
4490 	}
4491 
4492 	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4493 	    (void *)conn, DDI_NOSLEEP);
4494 
4495 	/*
4496 	 * If taskq dispatch fails above, then reset the timeout
4497 	 * to try again after 10 secs.
4498 	 */
4499 
4500 	if (error != DDI_SUCCESS) {
4501 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4502 		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4503 		mutex_exit(&conn->c_lock);
4504 		return;
4505 	}
4506 
4507 	conn->c_state = C_DISCONN_PEND;
4508 	mutex_exit(&conn->c_lock);
4509 }
4510 
4511 static rdma_stat
4512 rib_conn_release(CONN *conn)
4513 {
4514 	mutex_enter(&conn->c_lock);
4515 	return (rib_conn_release_locked(conn));
4516 }
4517 
4518 /*
4519  * Expects conn->c_lock to be held on entry.
4520  * c_lock released on return
4521  */
4522 static rdma_stat
4523 rib_conn_release_locked(CONN *conn)
4524 {
4525 	conn->c_ref--;
4526 
4527 	conn->c_last_used = gethrestime_sec();
4528 	if (conn->c_ref > 0) {
4529 		mutex_exit(&conn->c_lock);
4530 		return (RDMA_SUCCESS);
4531 	}
4532 
4533 	/*
4534 	 * If a conn is C_ERROR_CONN, close the channel.
4535 	 */
4536 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4537 		conn->c_state = C_DISCONN_PEND;
4538 		mutex_exit(&conn->c_lock);
4539 		rib_conn_close((void *)conn);
4540 		return (RDMA_SUCCESS);
4541 	}
4542 
4543 	/*
4544 	 * c_ref == 0, set a timeout for conn release
4545 	 */
4546 
4547 	if (conn->c_timeout == NULL) {
4548 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4549 		    SEC_TO_TICK(rib_conn_timeout));
4550 	}
4551 
4552 	mutex_exit(&conn->c_lock);
4553 	return (RDMA_SUCCESS);
4554 }
4555 
4556 /*
4557  * Add at front of list
4558  */
4559 static struct rdma_done_list *
4560 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4561 {
4562 	struct rdma_done_list *rd;
4563 
4564 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4565 
4566 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4567 	rd->xid = xid;
4568 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4569 
4570 	rd->prev = NULL;
4571 	rd->next = qp->rdlist;
4572 	if (qp->rdlist != NULL)
4573 		qp->rdlist->prev = rd;
4574 	qp->rdlist = rd;
4575 
4576 	return (rd);
4577 }
4578 
4579 static void
4580 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4581 {
4582 	struct rdma_done_list *r;
4583 
4584 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4585 
4586 	r = rd->next;
4587 	if (r != NULL) {
4588 		r->prev = rd->prev;
4589 	}
4590 
4591 	r = rd->prev;
4592 	if (r != NULL) {
4593 		r->next = rd->next;
4594 	} else {
4595 		qp->rdlist = rd->next;
4596 	}
4597 
4598 	cv_destroy(&rd->rdma_done_cv);
4599 	kmem_free(rd, sizeof (*rd));
4600 }
4601 
4602 static void
4603 rdma_done_rem_list(rib_qp_t *qp)
4604 {
4605 	struct rdma_done_list	*r, *n;
4606 
4607 	mutex_enter(&qp->rdlist_lock);
4608 	for (r = qp->rdlist; r != NULL; r = n) {
4609 		n = r->next;
4610 		rdma_done_rm(qp, r);
4611 	}
4612 	mutex_exit(&qp->rdlist_lock);
4613 }
4614 
4615 static void
4616 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4617 {
4618 	struct rdma_done_list *r = qp->rdlist;
4619 
4620 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4621 
4622 	while (r) {
4623 		if (r->xid == xid) {
4624 			cv_signal(&r->rdma_done_cv);
4625 			return;
4626 		} else {
4627 			r = r->next;
4628 		}
4629 	}
4630 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4631 	    int, xid);
4632 }
4633 
4634 /*
4635  * Expects conn->c_lock to be held by the caller.
4636  */
4637 
4638 static void
4639 rib_close_a_channel(CONN *conn)
4640 {
4641 	rib_qp_t	*qp;
4642 	qp = ctoqp(conn);
4643 
4644 	if (qp->qp_hdl == NULL) {
4645 		/* channel already freed */
4646 		return;
4647 	}
4648 
4649 	/*
4650 	 * Call ibt_close_rc_channel in blocking mode
4651 	 * with no callbacks.
4652 	 */
4653 	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4654 	    NULL, 0, NULL, NULL, 0);
4655 }
4656 
4657 /*
4658  * Goes through all connections and closes the channel
4659  * This will cause all the WRs on those channels to be
4660  * flushed.
4661  */
4662 static void
4663 rib_close_channels(rib_conn_list_t *connlist)
4664 {
4665 	CONN 		*conn, *tmp;
4666 
4667 	rw_enter(&connlist->conn_lock, RW_READER);
4668 	conn = connlist->conn_hd;
4669 	while (conn != NULL) {
4670 		mutex_enter(&conn->c_lock);
4671 		tmp = conn->c_next;
4672 		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4673 
4674 			if (conn->c_state == C_CONN_PEND) {
4675 				conn->c_flags |= C_CLOSE_PENDING;
4676 				goto next;
4677 			}
4678 
4679 			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4680 
4681 			/*
4682 			 * Live connection in CONNECTED state.
4683 			 */
4684 			if (conn->c_state == C_CONNECTED)
4685 				conn->c_state = C_ERROR_CONN;
4686 			mutex_exit(&conn->c_lock);
4687 
4688 			rib_close_a_channel(conn);
4689 
4690 			mutex_enter(&conn->c_lock);
4691 			conn->c_flags &= ~C_CLOSE_PENDING;
4692 			/* Signal a pending rib_disconnect_channel() */
4693 			cv_signal(&conn->c_cv);
4694 		}
4695 next:
4696 		mutex_exit(&conn->c_lock);
4697 		conn = tmp;
4698 	}
4699 	rw_exit(&connlist->conn_lock);
4700 }
4701 
4702 /*
4703  * Frees up all connections that are no longer being referenced
4704  */
4705 static void
4706 rib_purge_connlist(rib_conn_list_t *connlist)
4707 {
4708 	CONN 		*conn;
4709 
4710 top:
4711 	rw_enter(&connlist->conn_lock, RW_READER);
4712 	conn = connlist->conn_hd;
4713 	while (conn != NULL) {
4714 		mutex_enter(&conn->c_lock);
4715 
4716 		/*
4717 		 * At this point connection is either in ERROR
4718 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4719 		 * then some other thread is culling that connection.
4720 		 * If not and if c_ref is 0, then destroy the connection.
4721 		 */
4722 		if (conn->c_ref == 0 &&
4723 		    conn->c_state != C_DISCONN_PEND) {
4724 			/*
4725 			 * Cull the connection
4726 			 */
4727 			conn->c_state = C_DISCONN_PEND;
4728 			mutex_exit(&conn->c_lock);
4729 			rw_exit(&connlist->conn_lock);
4730 			(void) rib_disconnect_channel(conn, connlist);
4731 			goto top;
4732 		} else {
4733 			/*
4734 			 * conn disconnect already scheduled or will
4735 			 * happen from conn_release when c_ref drops to 0.
4736 			 */
4737 			mutex_exit(&conn->c_lock);
4738 		}
4739 		conn = conn->c_next;
4740 	}
4741 	rw_exit(&connlist->conn_lock);
4742 
4743 	/*
4744 	 * At this point, only connections with c_ref != 0 are on the list
4745 	 */
4746 }
4747 
4748 /*
4749  * Free all the HCA resources and close
4750  * the hca.
4751  */
4752 
4753 static void
4754 rib_free_hca(rib_hca_t *hca)
4755 {
4756 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4757 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4758 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4759 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4760 
4761 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4762 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4763 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4764 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4765 
4766 	rib_rbufpool_destroy(hca, RECV_BUFFER);
4767 	rib_rbufpool_destroy(hca, SEND_BUFFER);
4768 	rib_destroy_cache(hca);
4769 	if (rib_mod.rdma_count == 0)
4770 		(void) rdma_unregister_mod(&rib_mod);
4771 	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4772 	(void) ibt_close_hca(hca->hca_hdl);
4773 	hca->hca_hdl = NULL;
4774 }
4775 
4776 
4777 static void
4778 rib_stop_hca_services(rib_hca_t *hca)
4779 {
4780 	rib_stop_services(hca);
4781 	rib_close_channels(&hca->cl_conn_list);
4782 	rib_close_channels(&hca->srv_conn_list);
4783 
4784 	rib_purge_connlist(&hca->cl_conn_list);
4785 	rib_purge_connlist(&hca->srv_conn_list);
4786 
4787 	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4788 		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4789 		    GLOBAL_ZONEID);
4790 		stats_enabled = FALSE;
4791 	}
4792 
4793 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4794 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4795 	if (hca->srv_conn_list.conn_hd == NULL &&
4796 	    hca->cl_conn_list.conn_hd == NULL) {
4797 		/*
4798 		 * conn_lists are NULL, so destroy
4799 		 * buffers, close hca and be done.
4800 		 */
4801 		rib_free_hca(hca);
4802 	}
4803 	rw_exit(&hca->cl_conn_list.conn_lock);
4804 	rw_exit(&hca->srv_conn_list.conn_lock);
4805 
4806 	if (hca->hca_hdl != NULL) {
4807 		mutex_enter(&hca->inuse_lock);
4808 		while (hca->inuse)
4809 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4810 		mutex_exit(&hca->inuse_lock);
4811 
4812 		rib_free_hca(hca);
4813 	}
4814 	rw_destroy(&hca->bound_services_lock);
4815 
4816 	if (hca->cleanup_helper != NULL) {
4817 		ddi_taskq_destroy(hca->cleanup_helper);
4818 		hca->cleanup_helper = NULL;
4819 	}
4820 }
4821 
4822 /*
4823  * Cleans and closes up all uses of the HCA
4824  */
4825 static void
4826 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
4827 {
4828 	rib_hca_t *hca = NULL;
4829 	rib_hca_t **hcap;
4830 
4831 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4832 	for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
4833 		hca = *hcap;
4834 		rw_enter(&hca->state_lock, RW_WRITER);
4835 		if (hca->hca_hdl == hca_hdl) {
4836 			/*
4837 			 * Mark as detached and remove from
4838 			 * hca list.
4839 			 */
4840 			hca->state = HCA_DETACHED;
4841 			*hcap = hca->next;
4842 			rib_stat->nhca_inited--;
4843 			rib_mod.rdma_count--;
4844 			rw_exit(&hca->state_lock);
4845 			break;
4846 		}
4847 		rw_exit(&hca->state_lock);
4848 	}
4849 	rw_exit(&rib_stat->hcas_list_lock);
4850 
4851 	if (hca == NULL)
4852 		return;
4853 	ASSERT(hca->hca_hdl == hca_hdl);
4854 
4855 	/*
4856 	 * Stop all services on the HCA
4857 	 * Go through cl_conn_list and close all rc_channels
4858 	 * Go through svr_conn_list and close all rc_channels
4859 	 * Free connections whose c_ref has dropped to 0
4860 	 * Destroy all CQs
4861 	 * Deregister and released all buffer pool memory after all
4862 	 * connections are destroyed
4863 	 * Free the protection domain
4864 	 * ibt_close_hca()
4865 	 */
4866 	rib_stop_hca_services(hca);
4867 
4868 	kmem_free(hca, sizeof (*hca));
4869 }
4870 
4871 static void
4872 rib_server_side_cache_reclaim(void *argp)
4873 {
4874 	cache_avl_struct_t    *rcas;
4875 	rib_lrc_entry_t		*rb;
4876 	rib_hca_t *hca = (rib_hca_t *)argp;
4877 
4878 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4879 	rcas = avl_first(&hca->avl_tree);
4880 	if (rcas != NULL)
4881 		avl_remove(&hca->avl_tree, rcas);
4882 
4883 	while (rcas != NULL) {
4884 		while (rcas->r.forw != &rcas->r) {
4885 			rcas->elements--;
4886 			rb = rcas->r.forw;
4887 			remque(rb);
4888 			if (rb->registered)
4889 				(void) rib_deregistermem_via_hca(hca,
4890 				    rb->lrc_buf, rb->lrc_mhandle);
4891 
4892 			hca->cache_allocation -= rb->lrc_len;
4893 			kmem_free(rb->lrc_buf, rb->lrc_len);
4894 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4895 		}
4896 		mutex_destroy(&rcas->node_lock);
4897 		kmem_cache_free(hca->server_side_cache, rcas);
4898 		rcas = avl_first(&hca->avl_tree);
4899 		if (rcas != NULL)
4900 			avl_remove(&hca->avl_tree, rcas);
4901 	}
4902 	rw_exit(&hca->avl_rw_lock);
4903 }
4904 
4905 static void
4906 rib_server_side_cache_cleanup(void *argp)
4907 {
4908 	cache_avl_struct_t    *rcas;
4909 	rib_lrc_entry_t		*rb;
4910 	rib_hca_t *hca = (rib_hca_t *)argp;
4911 
4912 	mutex_enter(&hca->cache_allocation_lock);
4913 	if (hca->cache_allocation < cache_limit) {
4914 		mutex_exit(&hca->cache_allocation_lock);
4915 		return;
4916 	}
4917 	mutex_exit(&hca->cache_allocation_lock);
4918 
4919 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4920 	rcas = avl_last(&hca->avl_tree);
4921 	if (rcas != NULL)
4922 		avl_remove(&hca->avl_tree, rcas);
4923 
4924 	while (rcas != NULL) {
4925 		while (rcas->r.forw != &rcas->r) {
4926 			rcas->elements--;
4927 			rb = rcas->r.forw;
4928 			remque(rb);
4929 			if (rb->registered)
4930 				(void) rib_deregistermem_via_hca(hca,
4931 				    rb->lrc_buf, rb->lrc_mhandle);
4932 
4933 			hca->cache_allocation -= rb->lrc_len;
4934 
4935 			kmem_free(rb->lrc_buf, rb->lrc_len);
4936 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4937 		}
4938 		mutex_destroy(&rcas->node_lock);
4939 		if (hca->server_side_cache) {
4940 			kmem_cache_free(hca->server_side_cache, rcas);
4941 		}
4942 
4943 		if (hca->cache_allocation < cache_limit) {
4944 			rw_exit(&hca->avl_rw_lock);
4945 			return;
4946 		}
4947 
4948 		rcas = avl_last(&hca->avl_tree);
4949 		if (rcas != NULL)
4950 			avl_remove(&hca->avl_tree, rcas);
4951 	}
4952 	rw_exit(&hca->avl_rw_lock);
4953 }
4954 
4955 static int
4956 avl_compare(const void *t1, const void *t2)
4957 {
4958 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4959 		return (0);
4960 
4961 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4962 		return (-1);
4963 
4964 	return (1);
4965 }
4966 
4967 static void
4968 rib_destroy_cache(rib_hca_t *hca)
4969 {
4970 	if (hca->avl_init) {
4971 		rib_server_side_cache_reclaim((void *)hca);
4972 		if (hca->server_side_cache) {
4973 			kmem_cache_destroy(hca->server_side_cache);
4974 			hca->server_side_cache = NULL;
4975 		}
4976 		avl_destroy(&hca->avl_tree);
4977 		mutex_destroy(&hca->cache_allocation_lock);
4978 		rw_destroy(&hca->avl_rw_lock);
4979 	}
4980 	hca->avl_init = FALSE;
4981 }
4982 
4983 static void
4984 rib_force_cleanup(void *hca)
4985 {
4986 	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
4987 		(void) ddi_taskq_dispatch(
4988 		    ((rib_hca_t *)hca)->cleanup_helper,
4989 		    rib_server_side_cache_cleanup,
4990 		    (void *)hca, DDI_NOSLEEP);
4991 }
4992 
4993 static rib_lrc_entry_t *
4994 rib_get_cache_buf(CONN *conn, uint32_t len)
4995 {
4996 	cache_avl_struct_t	cas, *rcas;
4997 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4998 	rib_lrc_entry_t *reply_buf;
4999 	avl_index_t where = NULL;
5000 	uint64_t c_alloc = 0;
5001 
5002 	if (!hca->avl_init)
5003 		goto  error_alloc;
5004 
5005 	cas.len = len;
5006 
5007 	rw_enter(&hca->avl_rw_lock, RW_READER);
5008 
5009 	mutex_enter(&hca->cache_allocation_lock);
5010 	c_alloc = hca->cache_allocation;
5011 	mutex_exit(&hca->cache_allocation_lock);
5012 
5013 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5014 	    &where)) == NULL) {
5015 		/* Am I above the cache limit */
5016 		if ((c_alloc + len) >= cache_limit) {
5017 			rib_force_cleanup((void *)hca);
5018 			rw_exit(&hca->avl_rw_lock);
5019 			mutex_enter(&hca->cache_allocation_lock);
5020 			hca->cache_misses_above_the_limit ++;
5021 			mutex_exit(&hca->cache_allocation_lock);
5022 
5023 			/* Allocate and register the buffer directly */
5024 			goto error_alloc;
5025 		}
5026 
5027 		rw_exit(&hca->avl_rw_lock);
5028 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
5029 
5030 		/* Recheck to make sure no other thread added the entry in */
5031 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5032 		    &cas, &where)) == NULL) {
5033 			/* Allocate an avl tree entry */
5034 			rcas = (cache_avl_struct_t *)
5035 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5036 
5037 			bzero(rcas, sizeof (cache_avl_struct_t));
5038 			rcas->elements = 0;
5039 			rcas->r.forw = &rcas->r;
5040 			rcas->r.back = &rcas->r;
5041 			rcas->len = len;
5042 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5043 			avl_insert(&hca->avl_tree, rcas, where);
5044 		}
5045 	}
5046 
5047 	mutex_enter(&rcas->node_lock);
5048 
5049 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5050 		reply_buf = rcas->r.forw;
5051 		remque(reply_buf);
5052 		rcas->elements--;
5053 		mutex_exit(&rcas->node_lock);
5054 		rw_exit(&hca->avl_rw_lock);
5055 
5056 		mutex_enter(&hca->cache_allocation_lock);
5057 		hca->cache_hits++;
5058 		hca->cache_allocation -= len;
5059 		mutex_exit(&hca->cache_allocation_lock);
5060 	} else {
5061 		/* Am I above the cache limit */
5062 		mutex_exit(&rcas->node_lock);
5063 		if ((c_alloc + len) >= cache_limit) {
5064 			rib_force_cleanup((void *)hca);
5065 			rw_exit(&hca->avl_rw_lock);
5066 
5067 			mutex_enter(&hca->cache_allocation_lock);
5068 			hca->cache_misses_above_the_limit++;
5069 			mutex_exit(&hca->cache_allocation_lock);
5070 			/* Allocate and register the buffer directly */
5071 			goto error_alloc;
5072 		}
5073 		rw_exit(&hca->avl_rw_lock);
5074 		mutex_enter(&hca->cache_allocation_lock);
5075 		hca->cache_misses++;
5076 		mutex_exit(&hca->cache_allocation_lock);
5077 		/* Allocate a reply_buf entry */
5078 		reply_buf = (rib_lrc_entry_t *)
5079 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5080 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
5081 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
5082 		reply_buf->lrc_len  = len;
5083 		reply_buf->registered = FALSE;
5084 		reply_buf->avl_node = (void *)rcas;
5085 	}
5086 
5087 	return (reply_buf);
5088 
5089 error_alloc:
5090 	reply_buf = (rib_lrc_entry_t *)
5091 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5092 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
5093 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5094 	reply_buf->lrc_len = len;
5095 	reply_buf->registered = FALSE;
5096 	reply_buf->avl_node = NULL;
5097 
5098 	return (reply_buf);
5099 }
5100 
5101 /*
5102  * Return a pre-registered back to the cache (without
5103  * unregistering the buffer)..
5104  */
5105 
5106 static void
5107 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5108 {
5109 	cache_avl_struct_t    cas, *rcas;
5110 	avl_index_t where = NULL;
5111 	rib_hca_t	*hca = (ctoqp(conn))->hca;
5112 
5113 	if (!hca->avl_init)
5114 		goto  error_free;
5115 
5116 	cas.len = reg_buf->lrc_len;
5117 	rw_enter(&hca->avl_rw_lock, RW_READER);
5118 	if ((rcas = (cache_avl_struct_t *)
5119 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5120 		rw_exit(&hca->avl_rw_lock);
5121 		goto error_free;
5122 	} else {
5123 		cas.len = reg_buf->lrc_len;
5124 		mutex_enter(&rcas->node_lock);
5125 		insque(reg_buf, &rcas->r);
5126 		rcas->elements ++;
5127 		mutex_exit(&rcas->node_lock);
5128 		rw_exit(&hca->avl_rw_lock);
5129 		mutex_enter(&hca->cache_allocation_lock);
5130 		hca->cache_allocation += cas.len;
5131 		mutex_exit(&hca->cache_allocation_lock);
5132 	}
5133 
5134 	return;
5135 
5136 error_free:
5137 
5138 	if (reg_buf->registered)
5139 		(void) rib_deregistermem_via_hca(hca,
5140 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5141 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5142 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5143 }
5144 
5145 static rdma_stat
5146 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5147 	uint_t buflen, struct mrc *buf_handle)
5148 {
5149 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
5150 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
5151 	rdma_stat	status;
5152 
5153 
5154 	/*
5155 	 * Note: ALL buffer pools use the same memory type RDMARW.
5156 	 */
5157 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5158 	if (status == RDMA_SUCCESS) {
5159 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5160 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5161 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5162 	} else {
5163 		buf_handle->mrc_linfo = NULL;
5164 		buf_handle->mrc_lmr = 0;
5165 		buf_handle->mrc_rmr = 0;
5166 	}
5167 	return (status);
5168 }
5169 
5170 /* ARGSUSED */
5171 static rdma_stat
5172 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5173     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5174 {
5175 
5176 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5177 	return (RDMA_SUCCESS);
5178 }
5179 
5180 /* ARGSUSED */
5181 static rdma_stat
5182 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5183 {
5184 
5185 	(void) ibt_deregister_mr(hca->hca_hdl,
5186 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5187 	return (RDMA_SUCCESS);
5188 }
5189 
5190 /*
5191  * Check if the IP interface named by `lifrp' is RDMA-capable.
5192  */
5193 static boolean_t
5194 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5195 {
5196 	char ifname[LIFNAMSIZ];
5197 	char *cp;
5198 
5199 	if (lifrp->lifr_type == IFT_IB)
5200 		return (B_TRUE);
5201 
5202 	/*
5203 	 * Strip off the logical interface portion before getting
5204 	 * intimate with the name.
5205 	 */
5206 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5207 	if ((cp = strchr(ifname, ':')) != NULL)
5208 		*cp = '\0';
5209 
5210 	return (strcmp("lo0", ifname) == 0);
5211 }
5212 
5213 static int
5214 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5215 {
5216 	vnode_t *kkvp, *vp;
5217 	TIUSER  *tiptr;
5218 	struct  strioctl iocb;
5219 	k_sigset_t smask;
5220 	int	err = 0;
5221 
5222 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
5223 		if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
5224 		    &tiptr, CRED()) == 0) {
5225 			vp = tiptr->fp->f_vnode;
5226 		} else {
5227 			VN_RELE(kkvp);
5228 			return (EPROTO);
5229 		}
5230 	} else {
5231 		return (EPROTO);
5232 	}
5233 
5234 	iocb.ic_cmd = cmd;
5235 	iocb.ic_timout = 0;
5236 	iocb.ic_len = len;
5237 	iocb.ic_dp = (caddr_t)arg;
5238 	sigintr(&smask, 0);
5239 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5240 	sigunintr(&smask);
5241 	(void) t_kclose(tiptr, 0);
5242 	VN_RELE(kkvp);
5243 	return (err);
5244 }
5245 
5246 /*
5247  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5248  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5249  */
5250 static int
5251 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5252 {
5253 	int err;
5254 	struct lifnum lifn;
5255 
5256 	bzero(&lifn, sizeof (struct lifnum));
5257 	lifn.lifn_family = AF_UNSPEC;
5258 
5259 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5260 	if (err != 0)
5261 		return (err);
5262 
5263 	/*
5264 	 * Pad the interface count to account for additional interfaces that
5265 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5266 	 */
5267 	lifn.lifn_count += 4;
5268 
5269 	bzero(lifcp, sizeof (struct lifconf));
5270 	lifcp->lifc_family = AF_UNSPEC;
5271 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5272 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5273 
5274 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5275 	if (err != 0) {
5276 		kmem_free(lifcp->lifc_buf, *bufsizep);
5277 		return (err);
5278 	}
5279 	return (0);
5280 }
5281 
5282 static boolean_t
5283 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5284 {
5285 	uint_t i, nifs;
5286 	uint_t bufsize;
5287 	struct lifconf lifc;
5288 	struct lifreq *lifrp;
5289 	struct sockaddr_in *sinp;
5290 	struct sockaddr_in6 *sin6p;
5291 
5292 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5293 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5294 
5295 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5296 		return (B_FALSE);
5297 
5298 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5299 		kmem_free(lifc.lifc_buf, bufsize);
5300 		return (B_FALSE);
5301 	}
5302 
5303 	/*
5304 	 * Worst case is that all of the addresses are IB-capable and have
5305 	 * the same address family, so size our buffers accordingly.
5306 	 */
5307 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5308 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5309 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5310 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5311 
5312 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5313 		if (!rpcib_rdma_capable_interface(lifrp))
5314 			continue;
5315 
5316 		if (lifrp->lifr_addr.ss_family == AF_INET) {
5317 			sinp = addrs4->ri_list;
5318 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5319 			    sizeof (struct sockaddr_in));
5320 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5321 			sin6p = addrs6->ri_list;
5322 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5323 			    sizeof (struct sockaddr_in6));
5324 		}
5325 	}
5326 
5327 	kmem_free(lifc.lifc_buf, bufsize);
5328 	return (B_TRUE);
5329 }
5330 
5331 /* ARGSUSED */
5332 static int
5333 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5334 {
5335 	rib_hca_t *hca;
5336 
5337 	if (KSTAT_WRITE == rw) {
5338 		return (EACCES);
5339 	}
5340 
5341 	rpcib_kstat.cache_limit.value.ui64 =
5342 	    (uint64_t)cache_limit;
5343 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5344 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5345 		rpcib_kstat.cache_allocation.value.ui64 +=
5346 		    (uint64_t)hca->cache_allocation;
5347 		rpcib_kstat.cache_hits.value.ui64 +=
5348 		    (uint64_t)hca->cache_hits;
5349 		rpcib_kstat.cache_misses.value.ui64 +=
5350 		    (uint64_t)hca->cache_misses;
5351 		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5352 		    (uint64_t)hca->cache_misses_above_the_limit;
5353 	}
5354 	rw_exit(&rib_stat->hcas_list_lock);
5355 	return (0);
5356 }
5357