xref: /illumos-gate/usr/src/uts/common/rpc/rpcib.c (revision 186d582bd9dbcd38e0aeea49036d47d3426a3536)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developors:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * The rpcib plugin. Implements the interface for RDMATF's
42  * interaction with IBTF.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84 
85 #define	NFS_RDMA_PORT	20049
86 
87 
88 /*
89  * Convenience structures for connection management
90  */
/*
 * A list of addresses plus its bookkeeping (entry count and byte size).
 * rpcib_get_ib_addresses() takes two of these as arguments.
 */
91 typedef struct rpcib_ipaddrs {
92 	void	*ri_list;	/* pointer to list of addresses */
93 	uint_t	ri_count;	/* number of addresses in list */
94 	uint_t	ri_size;	/* size of ri_list in bytes */
95 } rpcib_ipaddrs_t;
96 
97 
/*
 * Bundle passed to rib_ping_srv(), rib_connect() and rib_conn_to_srv():
 * an HCA, an IB path and a source/destination IP address pair.
 * NOTE(review): which fields are inputs and which are filled in by the
 * callee is not visible from the declarations alone - confirm.
 */
98 typedef struct rpcib_ping {
99 	rib_hca_t  *hca;
100 	ibt_path_info_t path;
101 	ibt_ip_addr_t srcip;
102 	ibt_ip_addr_t dstip;
103 } rpcib_ping_t;
104 
105 /*
106  * Prototype declarations for driver ops
107  */
/* DDI entry points referenced from rpcib_ops below. */
108 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 				void *, void **);
111 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
/*
 * Forward declarations of internal helpers.  rpcib_cache_kstat_update()
 * is installed as the ks_update callback in rpcib_open_hcas().
 */
112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 static int	rpcib_do_ip_ioctl(int, int, void *);
114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 static int rpcib_cache_kstat_update(kstat_t *, int);
116 static void rib_force_cleanup(void *);
117 static void rib_stop_hca_services(rib_hca_t *);
118 static void rib_attach_hca(void);
119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
120 		struct netbuf *d_svcaddr, CONN **conn);
121 
/*
 * Named kstat counters (all 64-bit unsigned).  rpcib_open_hcas() installs
 * this structure as the data of the virtual "rpcib_cache" kstat, so the
 * number of members must stay in step with the
 * sizeof (rpcib_kstat) / sizeof (kstat_named_t) count used there.
 */
122 struct {
123 	kstat_named_t cache_limit;
124 	kstat_named_t cache_allocation;
125 	kstat_named_t cache_hits;
126 	kstat_named_t cache_misses;
127 	kstat_named_t cache_misses_above_the_limit;
128 } rpcib_kstat = {
129 	{"cache_limit",			KSTAT_DATA_UINT64 },
130 	{"cache_allocation",		KSTAT_DATA_UINT64 },
131 	{"cache_hits",			KSTAT_DATA_UINT64 },
132 	{"cache_misses",		KSTAT_DATA_UINT64 },
133 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
134 };
135 
136 /* rpcib cb_ops */
/*
 * Character/block entry points.  Only open and close are accepted (as
 * no-ops via nulldev); every data-path entry point is wired to nodev.
 * The initializer is positional and must follow struct cb_ops member order.
 */
137 static struct cb_ops rpcib_cbops = {
138 	nulldev,		/* open */
139 	nulldev,		/* close */
140 	nodev,			/* strategy */
141 	nodev,			/* print */
142 	nodev,			/* dump */
143 	nodev,			/* read */
144 	nodev,			/* write */
145 	nodev,			/* ioctl */
146 	nodev,			/* devmap */
147 	nodev,			/* mmap */
148 	nodev,			/* segmap */
149 	nochpoll,		/* poll */
150 	ddi_prop_op,		/* prop_op */
151 	NULL,			/* stream */
152 	D_MP,			/* cb_flag */
153 	CB_REV,			/* rev */
154 	nodev,			/* int (*cb_aread)() */
155 	nodev			/* int (*cb_awrite)() */
156 };
157 
158 /*
159  * Device options
160  */
/*
 * dev_ops vector handed to the module framework through rib_modldrv.
 * getinfo/attach/detach are implemented in this file; the remaining
 * entry points are stubs.
 */
161 static struct dev_ops rpcib_ops = {
162 	DEVO_REV,		/* devo_rev, */
163 	0,			/* refcnt  */
164 	rpcib_getinfo,		/* info */
165 	nulldev,		/* identify */
166 	nulldev,		/* probe */
167 	rpcib_attach,		/* attach */
168 	rpcib_detach,		/* detach */
169 	nodev,			/* reset */
170 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
171 	NULL,			/* bus operations */
172 	NULL,			/* power */
173 	ddi_quiesce_not_needed,		/* quiesce */
174 };
175 
176 /*
177  * Module linkage information.
178  */
179 
/* Driver linkage: ties the module to rpcib_ops. */
180 static struct modldrv rib_modldrv = {
181 	&mod_driverops,		/* Driver module */
182 	"RPCIB plugin driver",	/* Driver name and version */
183 	&rpcib_ops,		/* Driver ops */
184 };
185 
/* Passed to mod_install(), mod_remove() and mod_info() in _init/_fini/_info. */
186 static struct modlinkage rib_modlinkage = {
187 	MODREV_1,
188 	(void *)&rib_modldrv,
189 	NULL
190 };
191 
/*
 * One buffer entry; embedded as member 'r' of cache_avl_struct_t and
 * returned by rib_get_cache_buf().
 * NOTE(review): the field descriptions below are inferred from the field
 * names only; their users are outside this part of the file - confirm.
 */
192 typedef struct rib_lrc_entry {
193 	struct rib_lrc_entry *forw;	/* presumably next entry in a list */
194 	struct rib_lrc_entry *back;	/* presumably previous entry in a list */
195 	char *lrc_buf;			/* presumably the buffer itself */
196 
197 	uint32_t lrc_len;		/* presumably length of lrc_buf */
198 	void  *avl_node;
199 	bool_t registered;
200 
201 	struct mrc lrc_mhandle;
202 	bool_t lrc_on_freed_list;
203 } rib_lrc_entry_t;
204 
/*
 * Node type of each HCA's avl_tree and object type of the per-HCA
 * server_side_cache kmem cache (both set up in rpcib_open_hcas()).
 * avl_link is the AVL linkage whose offset is passed to avl_create().
 */
205 typedef	struct cache_struct	{
206 	rib_lrc_entry_t		r;
207 	uint32_t		len;
208 	uint32_t		elements;
209 	kmutex_t		node_lock;
210 	avl_node_t		avl_link;
211 } cache_avl_struct_t;
212 
213 uint64_t	cache_limit = 100 * 1024 * 1024;	/* 100 MB */
214 static uint64_t	cache_watermark = 80 * 1024 * 1024;	/* 80 MB */
/* Set to TRUE once rpcib_open_hcas() has installed the rpcib_cache kstat. */
215 static bool_t	stats_enabled = FALSE;
216 
217 static uint64_t max_unsignaled_rws = 5;
218 int nfs_rdma_port = NFS_RDMA_PORT;
219 
220 #define	RIBNETID_TCP	"tcp"
221 #define	RIBNETID_TCP6	"tcp6"
222 
223 /*
224  * rib_stat: private data pointer used when registering
225  *	with the IBTF.  It is returned to the consumer
226  *	in all callbacks.  Allocated in rpcib_attach() and
227  *	freed (and reset to NULL) in rpcib_detach().
228  */
228 static rpcib_state_t *rib_stat = NULL;
229 
230 #define	RNR_RETRIES	IBT_RNR_RETRY_1
231 #define	MAX_PORTS	2
232 #define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
233 #define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */
234 
235 int preposted_rbufs = RDMA_BUFS_GRANT;
236 int send_threshold = 1;
237 
238 /*
239  * Old cards with Tavor driver have limited memory footprint
240  * when booted in 32bit. The rib_max_rbufs tunable can be
241  * tuned for more buffers if needed.
242  */
243 
244 #if !defined(_ELF64) && !defined(__sparc)
245 int rib_max_rbufs = MAX_BUFS;
246 #else
247 int rib_max_rbufs = 10 * MAX_BUFS;
248 #endif	/* !(_ELF64) && !(__sparc) */
249 
250 int rib_conn_timeout = 60 * 12;		/* 12 minutes */
251 
252 /*
253  * State of the plugin.
254  * ACCEPT = accepting new connections and requests.
255  * NO_ACCEPT = not accepting new connections and requests.
256  * This should eventually move to rpcib_state_t structure, since this
257  * will tell in which state the plugin is for a particular type of service
258  * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
259  * state for one and in no_accept state for the other.
260  */
261 int		plugin_state;
/* Protects plugin_state; initialised in _init() and destroyed in _fini(). */
262 kmutex_t	plugin_state_lock;
263 
264 ldi_ident_t rpcib_li;
265 
266 /*
267  * RPCIB RDMATF operations
268  */
/*
 * Most of the functions declared in this first group are installed in
 * the rib_ops vector below; the remainder are helpers they rely on.
 */
269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
270 static rdma_stat rib_disconnect(CONN *conn);
271 static void rib_listen(struct rdma_svc_data *rd);
272 static void rib_listen_stop(struct rdma_svc_data *rd);
273 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
274 	uint_t buflen, struct mrc *buf_handle);
275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
276 	struct mrc buf_handle);
277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
278 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
280 		struct mrc buf_handle);
281 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
282 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
283 	void *lrc);
284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
285 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
287 	caddr_t buf, int len, int cpu);
288 
289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
290 
291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
293 
294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
295 
296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
305 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
306 	int addr_type, void *, CONN **);
307 static rdma_stat rib_conn_release(CONN *conn);
308 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
309 	rpcib_ping_t *, CONN **);
310 static rdma_stat rib_getinfo(rdma_info_t *info);
311 
/*
 * Buffer cache helpers.  rib_server_side_cache_reclaim() is registered as
 * the reclaim callback of the per-HCA kmem cache, and avl_compare() as the
 * comparator of the per-HCA AVL tree, in rpcib_open_hcas().
 */
312 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
313 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
314 static void rib_destroy_cache(rib_hca_t *hca);
315 static	void	rib_server_side_cache_reclaim(void *argp);
316 static int avl_compare(const void *t1, const void *t2);
317 
318 static void rib_stop_services(rib_hca_t *);
319 static void rib_close_channels(rib_conn_list_t *);
320 static void rib_conn_close(void *);
321 static void rib_recv_rele(rib_qp_t *);
322 static rdma_stat rib_conn_release_locked(CONN *conn);
323 
324 /*
325  * RPCIB addressing operations
326  */
327 
328 /*
329  * RDMA operations the RPCIB module exports
330  */
/*
 * Operation vector referenced from rib_mod.  The initializer is
 * positional, so the entries must stay in the member order of rdmaops_t
 * (declared outside this file).
 */
331 static rdmaops_t rib_ops = {
332 	rib_reachable,
333 	rib_conn_get,
334 	rib_conn_release,
335 	rib_listen,
336 	rib_listen_stop,
337 	rib_registermem,
338 	rib_deregistermem,
339 	rib_registermemsync,
340 	rib_deregistermemsync,
341 	rib_syncmem,
342 	rib_reg_buf_alloc,
343 	rib_reg_buf_free,
344 	rib_send,
345 	rib_send_resp,
346 	rib_post_resp,
347 	rib_post_resp_remove,
348 	rib_post_recv,
349 	rib_recv,
350 	rib_read,
351 	rib_write,
352 	rib_getinfo,
353 };
354 
355 /*
356  * RDMATF RPCIB plugin details
357  */
/*
 * Module descriptor registered with RDMATF via rdma_register_mod() in
 * rpcib_attach().  Its rdma_count member is kept equal to the number of
 * initialised HCAs by rpcib_open_hcas() and rpcib_free_hca_list().
 */
358 static rdma_mod_t rib_mod = {
359 	"ibtf",		/* api name */
360 	RDMATF_VERS_1,
361 	0,		/* presumably rdma_count - confirm member order */
362 	&rib_ops,	/* rdma op vector for ibtf */
363 };
364 
/*
 * Forward declarations of file-local helpers.  The four *_cq_handler
 * functions are the completion-queue callbacks handed to rib_create_cq()
 * by rpcib_open_hcas(); rib_async_handler() is the IBTF async event
 * handler named in rib_modinfo.
 */
365 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
366 static rdma_stat rib_qp_init(rib_qp_t *, int);
367 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
368 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
369 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
370 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
371 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
372 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
373 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
374 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
375 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
376 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
377 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
378 	rib_qp_t **);
379 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
380 	rib_qp_t **);
381 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
382 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
383 static int rib_free_sendwait(struct send_wid *);
384 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
385 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
386 static void rdma_done_rem_list(rib_qp_t *);
387 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
388 
389 static void rib_async_handler(void *,
390 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
391 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
392 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
393 static int rib_free_svc_recv(struct svc_recv *);
394 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
395 static void rib_free_wid(struct recv_wid *);
396 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
397 static void rib_detach_hca(ibt_hca_hdl_t);
398 static void rib_close_a_channel(CONN *);
399 static void rib_send_hold(rib_qp_t *);
400 static void rib_send_rele(rib_qp_t *);
401 
402 /*
403  * Registration with IBTF as a consumer
404  */
/* Client description passed to ibt_attach() in rpcib_attach(). */
405 static struct ibt_clnt_modinfo_s rib_modinfo = {
406 	IBTI_V_CURR,
407 	IBT_GENERIC,
408 	rib_async_handler,	/* async event handler */
409 	NULL,			/* Memory Region Handler */
410 	"nfs/ib"
411 };
412 
413 /*
414  * Global structure
415  */
416 
/*
 * Per-driver soft state.  rpcib_dip is recorded by rpcib_attach() and
 * cleared by rpcib_detach(); both update it while holding rpcib_mutex.
 */
417 typedef struct rpcib_s {
418 	dev_info_t	*rpcib_dip;
419 	kmutex_t	rpcib_mutex;
420 } rpcib_t;
421 
422 rpcib_t rpcib;
423 
424 /*
425  * /etc/system controlled variable to control
426  * debugging in rpcib kernel module.
427  * Set it to values greater than 1 to control
428  * the amount of debugging messages required.
429  */
430 int rib_debug = 0;
431 
432 int
433 _init(void)
434 {
435 	int error;
436 
437 	error = mod_install((struct modlinkage *)&rib_modlinkage);
438 	if (error != 0) {
439 		/*
440 		 * Could not load module
441 		 */
442 		return (error);
443 	}
444 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
445 	return (0);
446 }
447 
448 int
449 _fini()
450 {
451 	int status;
452 
453 	/*
454 	 * Remove module
455 	 */
456 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
457 		return (status);
458 	}
459 	mutex_destroy(&plugin_state_lock);
460 	return (0);
461 }
462 
463 int
464 _info(struct modinfo *modinfop)
465 {
466 	return (mod_info(&rib_modlinkage, modinfop));
467 }
468 
469 /*
470  * rpcib_getinfo()
471  * Given the device number, return the devinfo pointer or the
472  * instance number.
473  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
474  */
475 
476 /*ARGSUSED*/
477 static int
478 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
479 {
480 	int ret = DDI_SUCCESS;
481 
482 	switch (cmd) {
483 	case DDI_INFO_DEVT2DEVINFO:
484 		if (rpcib.rpcib_dip != NULL)
485 			*result = rpcib.rpcib_dip;
486 		else {
487 			*result = NULL;
488 			ret = DDI_FAILURE;
489 		}
490 		break;
491 
492 	case DDI_INFO_DEVT2INSTANCE:
493 		*result = NULL;
494 		break;
495 
496 	default:
497 		ret = DDI_FAILURE;
498 	}
499 	return (ret);
500 }
501 
502 static void
503 rpcib_free_hca_list()
504 {
505 	rib_hca_t *hca, *hcap;
506 
507 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
508 	hca = rib_stat->hcas_list;
509 	rib_stat->hcas_list = NULL;
510 	rw_exit(&rib_stat->hcas_list_lock);
511 	while (hca != NULL) {
512 		rw_enter(&hca->state_lock, RW_WRITER);
513 		hcap = hca;
514 		hca = hca->next;
515 		rib_stat->nhca_inited--;
516 		rib_mod.rdma_count--;
517 		hcap->state = HCA_DETACHED;
518 		rw_exit(&hcap->state_lock);
519 		rib_stop_hca_services(hcap);
520 
521 		kmem_free(hcap, sizeof (*hcap));
522 	}
523 }
524 
525 static rdma_stat
526 rpcib_free_service_list()
527 {
528 	rib_service_t *service;
529 	ibt_status_t ret;
530 
531 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
532 	while (rib_stat->service_list != NULL) {
533 		service = rib_stat->service_list;
534 		ret = ibt_unbind_all_services(service->srv_hdl);
535 		if (ret != IBT_SUCCESS) {
536 			rw_exit(&rib_stat->service_list_lock);
537 #ifdef DEBUG
538 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
539 			    "ibt_unbind_all_services failed (%d)\n", (int)ret);
540 #endif
541 			return (RDMA_FAILED);
542 		}
543 		ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
544 		    service->srv_hdl);
545 		if (ret != IBT_SUCCESS) {
546 			rw_exit(&rib_stat->service_list_lock);
547 #ifdef DEBUG
548 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
549 			    "ibt_deregister_service failed (%d)\n", (int)ret);
550 #endif
551 			return (RDMA_FAILED);
552 		}
553 		rib_stat->service_list = service->next;
554 		kmem_free(service, sizeof (rib_service_t));
555 	}
556 	rw_exit(&rib_stat->service_list_lock);
557 
558 	return (RDMA_SUCCESS);
559 }
560 
561 static int
562 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
563 {
564 	ibt_status_t	ibt_status;
565 	rdma_stat	r_status;
566 
567 	switch (cmd) {
568 	case DDI_ATTACH:
569 		break;
570 	case DDI_RESUME:
571 		return (DDI_SUCCESS);
572 	default:
573 		return (DDI_FAILURE);
574 	}
575 
576 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
577 
578 	mutex_enter(&rpcib.rpcib_mutex);
579 	if (rpcib.rpcib_dip != NULL) {
580 		mutex_exit(&rpcib.rpcib_mutex);
581 		return (DDI_FAILURE);
582 	}
583 	rpcib.rpcib_dip = dip;
584 	mutex_exit(&rpcib.rpcib_mutex);
585 	/*
586 	 * Create the "rpcib" minor-node.
587 	 */
588 	if (ddi_create_minor_node(dip,
589 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
590 		/* Error message, no cmn_err as they print on console */
591 		return (DDI_FAILURE);
592 	}
593 
594 	if (rib_stat == NULL) {
595 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
596 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
597 		rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
598 		mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
599 	}
600 
601 	rib_stat->hca_count = ibt_get_hca_list(NULL);
602 	if (rib_stat->hca_count < 1) {
603 		mutex_destroy(&rib_stat->listen_lock);
604 		rw_destroy(&rib_stat->hcas_list_lock);
605 		mutex_destroy(&rib_stat->open_hca_lock);
606 		kmem_free(rib_stat, sizeof (*rib_stat));
607 		rib_stat = NULL;
608 		return (DDI_FAILURE);
609 	}
610 
611 	ibt_status = ibt_attach(&rib_modinfo, dip,
612 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
613 
614 	if (ibt_status != IBT_SUCCESS) {
615 		mutex_destroy(&rib_stat->listen_lock);
616 		rw_destroy(&rib_stat->hcas_list_lock);
617 		mutex_destroy(&rib_stat->open_hca_lock);
618 		kmem_free(rib_stat, sizeof (*rib_stat));
619 		rib_stat = NULL;
620 		return (DDI_FAILURE);
621 	}
622 
623 	rib_stat->service_list = NULL;
624 	rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
625 	mutex_enter(&rib_stat->open_hca_lock);
626 	if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
627 		mutex_exit(&rib_stat->open_hca_lock);
628 		goto open_fail;
629 	}
630 	mutex_exit(&rib_stat->open_hca_lock);
631 
632 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
633 	    DDI_PROP_SUCCESS) {
634 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
635 		    "failed.");
636 		goto register_fail;
637 	}
638 
639 	/*
640 	 * Register with rdmatf
641 	 */
642 	r_status = rdma_register_mod(&rib_mod);
643 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
644 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
645 		    "status = %d", r_status);
646 		goto register_fail;
647 	}
648 
649 	return (DDI_SUCCESS);
650 
651 register_fail:
652 
653 open_fail:
654 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
655 	rpcib_free_hca_list();
656 	(void) rpcib_free_service_list();
657 	mutex_destroy(&rib_stat->listen_lock);
658 	rw_destroy(&rib_stat->hcas_list_lock);
659 	mutex_destroy(&rib_stat->open_hca_lock);
660 	rw_destroy(&rib_stat->service_list_lock);
661 	kmem_free(rib_stat, sizeof (*rib_stat));
662 	rib_stat = NULL;
663 	return (DDI_FAILURE);
664 }
665 
666 /*ARGSUSED*/
667 static int
668 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
669 {
670 	switch (cmd) {
671 
672 	case DDI_DETACH:
673 		break;
674 
675 	case DDI_SUSPEND:
676 	default:
677 		return (DDI_FAILURE);
678 	}
679 
680 	/*
681 	 * Detach the hca and free resources
682 	 */
683 	mutex_enter(&plugin_state_lock);
684 	plugin_state = NO_ACCEPT;
685 	mutex_exit(&plugin_state_lock);
686 
687 	if (rpcib_free_service_list() != RDMA_SUCCESS)
688 		return (DDI_FAILURE);
689 	rpcib_free_hca_list();
690 
691 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
692 	mutex_destroy(&rib_stat->listen_lock);
693 	rw_destroy(&rib_stat->hcas_list_lock);
694 	mutex_destroy(&rib_stat->open_hca_lock);
695 	rw_destroy(&rib_stat->service_list_lock);
696 
697 	kmem_free(rib_stat, sizeof (*rib_stat));
698 	rib_stat = NULL;
699 
700 	mutex_enter(&rpcib.rpcib_mutex);
701 	rpcib.rpcib_dip = NULL;
702 	mutex_exit(&rpcib.rpcib_mutex);
703 	mutex_destroy(&rpcib.rpcib_mutex);
704 	return (DDI_SUCCESS);
705 }
706 
707 
/*
 * Forward declarations: buffer pool teardown, reply list and connection
 * list helpers.  rib_rbufpool_destroy() is used by rpcib_open_hcas() to
 * undo rib_rbufpool_create() on failure.
 */
708 static void rib_rbufpool_free(rib_hca_t *, int);
709 static void rib_rbufpool_deregister(rib_hca_t *, int);
710 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
711 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
712 static rdma_stat rib_rem_replylist(rib_qp_t *);
713 static int rib_remreply(rib_qp_t *, struct reply *);
714 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
715 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
716 
717 
718 /*
719  * One CQ pair per HCA
720  */
721 static rdma_stat
722 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
723 	rib_cq_t **cqp)
724 {
725 	rib_cq_t	*cq;
726 	ibt_cq_attr_t	cq_attr;
727 	uint32_t	real_size;
728 	ibt_status_t	status;
729 	rdma_stat	error = RDMA_SUCCESS;
730 
731 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
732 	cq->rib_hca = hca;
733 	cq_attr.cq_size = cq_size;
734 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
735 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
736 	    &real_size);
737 	if (status != IBT_SUCCESS) {
738 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
739 		    " status=%d", status);
740 		error = RDMA_FAILED;
741 		goto fail;
742 	}
743 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
744 
745 	/*
746 	 * Enable CQ callbacks. CQ Callbacks are single shot
747 	 * (e.g. you have to call ibt_enable_cq_notify()
748 	 * after each callback to get another one).
749 	 */
750 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
751 	if (status != IBT_SUCCESS) {
752 		cmn_err(CE_WARN, "rib_create_cq: "
753 		    "enable_cq_notify failed, status %d", status);
754 		error = RDMA_FAILED;
755 		goto fail;
756 	}
757 	*cqp = cq;
758 
759 	return (error);
760 fail:
761 	if (cq->rib_cq_hdl)
762 		(void) ibt_free_cq(cq->rib_cq_hdl);
763 	if (cq)
764 		kmem_free(cq, sizeof (rib_cq_t));
765 	return (error);
766 }
767 
768 /*
769  * rpcib_find_hca
770  *
771  * Caller should have already locked the hcas_lock before calling
772  * this function.
773  */
774 static rib_hca_t *
775 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
776 {
777 	rib_hca_t *hca = ribstat->hcas_list;
778 
779 	while (hca && hca->hca_guid != guid)
780 		hca = hca->next;
781 
782 	return (hca);
783 }
784 
785 static rdma_stat
786 rpcib_open_hcas(rpcib_state_t *ribstat)
787 {
788 	rib_hca_t		*hca;
789 	ibt_status_t		ibt_status;
790 	rdma_stat		status;
791 	ibt_hca_portinfo_t	*pinfop;
792 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
793 	uint_t			size, cq_size;
794 	int			i;
795 	kstat_t *ksp;
796 	cache_avl_struct_t example_avl_node;
797 	char rssc_name[32];
798 	int old_nhca_inited = ribstat->nhca_inited;
799 	ib_guid_t		*hca_guids;
800 
801 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
802 
803 	ribstat->hca_count = ibt_get_hca_list(&hca_guids);
804 	if (ribstat->hca_count == 0)
805 		return (RDMA_FAILED);
806 
807 	rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
808 	/*
809 	 * Open a hca and setup for RDMA
810 	 */
811 	for (i = 0; i < ribstat->hca_count; i++) {
812 		if (rpcib_find_hca(ribstat, hca_guids[i]))
813 			continue;
814 		hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
815 
816 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
817 		    hca_guids[i], &hca->hca_hdl);
818 		if (ibt_status != IBT_SUCCESS) {
819 			kmem_free(hca, sizeof (rib_hca_t));
820 			continue;
821 		}
822 		hca->hca_guid = hca_guids[i];
823 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
824 		hca->state = HCA_INITED;
825 
826 		/*
827 		 * query HCA info
828 		 */
829 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
830 		if (ibt_status != IBT_SUCCESS) {
831 			goto fail1;
832 		}
833 
834 		/*
835 		 * One PD (Protection Domain) per HCA.
836 		 * A qp is allowed to access a memory region
837 		 * only when it's in the same PD as that of
838 		 * the memory region.
839 		 */
840 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
841 		if (ibt_status != IBT_SUCCESS) {
842 			goto fail1;
843 		}
844 
845 		/*
846 		 * query HCA ports
847 		 */
848 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
849 		    0, &pinfop, &hca->hca_nports, &size);
850 		if (ibt_status != IBT_SUCCESS) {
851 			goto fail2;
852 		}
853 		hca->hca_ports = pinfop;
854 		hca->hca_pinfosz = size;
855 		pinfop = NULL;
856 
857 		cq_size = DEF_CQ_SIZE; /* default cq size */
858 		/*
859 		 * Create 2 pairs of cq's (1 pair for client
860 		 * and the other pair for server) on this hca.
861 		 * If number of qp's gets too large, then several
862 		 * cq's will be needed.
863 		 */
864 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
865 		    &hca->svc_rcq);
866 		if (status != RDMA_SUCCESS) {
867 			goto fail3;
868 		}
869 
870 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
871 		    &hca->svc_scq);
872 		if (status != RDMA_SUCCESS) {
873 			goto fail3;
874 		}
875 
876 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
877 		    &hca->clnt_rcq);
878 		if (status != RDMA_SUCCESS) {
879 			goto fail3;
880 		}
881 
882 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
883 		    &hca->clnt_scq);
884 		if (status != RDMA_SUCCESS) {
885 			goto fail3;
886 		}
887 
888 		/*
889 		 * Create buffer pools.
890 		 * Note rib_rbuf_create also allocates memory windows.
891 		 */
892 		hca->recv_pool = rib_rbufpool_create(hca,
893 		    RECV_BUFFER, rib_max_rbufs);
894 		if (hca->recv_pool == NULL) {
895 			goto fail3;
896 		}
897 
898 		hca->send_pool = rib_rbufpool_create(hca,
899 		    SEND_BUFFER, rib_max_rbufs);
900 		if (hca->send_pool == NULL) {
901 			rib_rbufpool_destroy(hca, RECV_BUFFER);
902 			goto fail3;
903 		}
904 
905 		if (hca->server_side_cache == NULL) {
906 			(void) sprintf(rssc_name,
907 			    "rib_srvr_cache_%llx",
908 			    (long long unsigned int) hca->hca_guid);
909 			hca->server_side_cache = kmem_cache_create(
910 			    rssc_name,
911 			    sizeof (cache_avl_struct_t), 0,
912 			    NULL,
913 			    NULL,
914 			    rib_server_side_cache_reclaim,
915 			    hca, NULL, 0);
916 		}
917 
918 		avl_create(&hca->avl_tree,
919 		    avl_compare,
920 		    sizeof (cache_avl_struct_t),
921 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
922 		    (uint_t)(uintptr_t)&example_avl_node);
923 
924 		rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
925 		    hca->iblock);
926 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
927 		rw_init(&hca->avl_rw_lock,
928 		    NULL, RW_DRIVER, hca->iblock);
929 		mutex_init(&hca->cache_allocation_lock,
930 		    NULL, MUTEX_DRIVER, NULL);
931 		hca->avl_init = TRUE;
932 
933 		/* Create kstats for the cache */
934 		ASSERT(INGLOBALZONE(curproc));
935 
936 		if (!stats_enabled) {
937 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
938 			    KSTAT_TYPE_NAMED,
939 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
940 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
941 			    GLOBAL_ZONEID);
942 			if (ksp) {
943 				ksp->ks_data = (void *) &rpcib_kstat;
944 				ksp->ks_update = rpcib_cache_kstat_update;
945 				kstat_install(ksp);
946 				stats_enabled = TRUE;
947 			}
948 		}
949 		if (hca->cleanup_helper == NULL) {
950 			char tq_name[sizeof (hca->hca_guid) * 2 + 1];
951 
952 			(void) snprintf(tq_name, sizeof (tq_name), "%llX",
953 			    (unsigned long long int) hca->hca_guid);
954 			hca->cleanup_helper = ddi_taskq_create(NULL,
955 			    tq_name, 1, TASKQ_DEFAULTPRI, 0);
956 		}
957 
958 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
959 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
960 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
961 		    hca->iblock);
962 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
963 		    hca->iblock);
964 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
965 		hca->inuse = TRUE;
966 
967 		hca->next = ribstat->hcas_list;
968 		ribstat->hcas_list = hca;
969 		ribstat->nhca_inited++;
970 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
971 		continue;
972 
973 fail3:
974 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
975 fail2:
976 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
977 fail1:
978 		(void) ibt_close_hca(hca->hca_hdl);
979 		kmem_free(hca, sizeof (rib_hca_t));
980 	}
981 	rw_exit(&ribstat->hcas_list_lock);
982 	ibt_free_hca_list(hca_guids, ribstat->hca_count);
983 	rib_mod.rdma_count = rib_stat->nhca_inited;
984 
985 	/*
986 	 * return success if at least one new hca has been configured.
987 	 */
988 	if (ribstat->nhca_inited != old_nhca_inited)
989 		return (RDMA_SUCCESS);
990 	else
991 		return (RDMA_FAILED);
992 }
993 
994 /*
995  * Callback routines
996  */
997 
998 /*
999  * SCQ handlers
1000  */
/*
 * rib_clnt_scq_handler()
 * Client send completion queue callback (installed through
 * rib_create_cq() in rpcib_open_hcas()).
 *
 * Re-arms CQ notification, then polls completions one at a time until
 * ibt_poll_cq() returns anything other than IBT_SUCCESS.  For each
 * completion other than RDMA_DUMMY_WRID, the work request id is the
 * address of the struct send_wid describing the send:
 *  - its status is set to RDMA_SUCCESS or RDMA_FAILED; on failure the
 *    connection is moved to C_ERROR_CONN unless it is C_DISCONN_PEND;
 *  - if the poster is waiting (cv_sig == 1) it is signalled and keeps
 *    ownership of the send_wid; otherwise the send buffers, the send
 *    reference on the qp and the send_wid itself are released here.
 */
1001 /* ARGSUSED */
1002 static void
1003 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1004 {
1005 	ibt_status_t	ibt_status;
1006 	ibt_wc_t	wc;
1007 	struct send_wid	*wd;
1008 	CONN		*conn;
1009 	rib_qp_t	*qp;
1010 	int		i;
1011 
1012 	/*
1013 	 * Re-enable cq notify here to avoid missing any
1014 	 * completion queue notification.
1015 	 */
1016 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1017 
1018 	ibt_status = IBT_SUCCESS;
1019 	while (ibt_status != IBT_CQ_EMPTY) {
1020 		bzero(&wc, sizeof (wc));
1021 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		/* Empty queue or poll error: nothing more to process. */
1022 		if (ibt_status != IBT_SUCCESS)
1023 			return;
1024 
1025 		/*
1026 		 * Got a send completion
1027 		 */
1028 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1029 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1030 			qp = wd->qp;
1031 			conn = qptoc(qp);
1032 
1033 			mutex_enter(&wd->sendwait_lock);
1034 			switch (wc.wc_status) {
1035 			case IBT_WC_SUCCESS:
1036 				wd->status = RDMA_SUCCESS;
1037 				break;
1038 			default:
1039 /*
1040  *    RC Send Q Error Code		Local state     Remote State
1041  *    ==================== 		===========     ============
1042  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
1043  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
1044  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
1045  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
1046  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
1047  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
1048  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
1049  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
1050  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
1051  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
1052  *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
1053  */
1054 				/*
1055 				 * Channel in error state. Set connection to
1056 				 * ERROR and cleanup will happen either from
1057 				 * conn_release  or from rib_conn_get
1058 				 */
1059 				wd->status = RDMA_FAILED;
1060 				mutex_enter(&conn->c_lock);
1061 				if (conn->c_state != C_DISCONN_PEND)
1062 					conn->c_state = C_ERROR_CONN;
1063 				mutex_exit(&conn->c_lock);
1064 				break;
1065 			}
1066 
1067 			if (wd->cv_sig == 1) {
1068 				/*
1069 				 * Notify poster
1070 				 */
1071 				cv_signal(&wd->wait_cv);
1072 				mutex_exit(&wd->sendwait_lock);
1073 			} else {
1074 				/*
1075 				 * Poster not waiting for notification.
1076 				 * Free the send buffers and send_wid
1077 				 */
1078 				for (i = 0; i < wd->nsbufs; i++) {
1079 					rib_rbuf_free(qptoc(wd->qp),
1080 					    SEND_BUFFER,
1081 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1082 				}
1083 
1084 				/* decrement the send ref count */
1085 				rib_send_rele(qp);
1086 
				/* Drop the lock before the send_wid is freed. */
1087 				mutex_exit(&wd->sendwait_lock);
1088 				(void) rib_free_sendwait(wd);
1089 			}
1090 		}
1091 	}
1092 }
1093 
1094 /* ARGSUSED */
1095 static void
1096 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1097 {
1098 	ibt_status_t	ibt_status;
1099 	ibt_wc_t	wc;
1100 	struct send_wid	*wd;
1101 	rib_qp_t	*qp;
1102 	CONN		*conn;
1103 	int		i;
1104 
1105 	/*
1106 	 * Re-enable cq notify here to avoid missing any
1107 	 * completion queue notification.
1108 	 */
1109 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1110 
1111 	ibt_status = IBT_SUCCESS;
1112 	while (ibt_status != IBT_CQ_EMPTY) {
1113 		bzero(&wc, sizeof (wc));
1114 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1115 		if (ibt_status != IBT_SUCCESS)
1116 			return;
1117 
1118 		/*
1119 		 * Got a send completion
1120 		 */
1121 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1122 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1123 			qp = wd->qp;
1124 			conn = qptoc(qp);
1125 			mutex_enter(&wd->sendwait_lock);
1126 
1127 			switch (wc.wc_status) {
1128 			case IBT_WC_SUCCESS:
1129 				wd->status = RDMA_SUCCESS;
1130 				break;
1131 			default:
1132 				/*
1133 				 * Channel in error state. Set connection to
1134 				 * ERROR and cleanup will happen either from
1135 				 * conn_release  or conn timeout.
1136 				 */
1137 				wd->status = RDMA_FAILED;
1138 				mutex_enter(&conn->c_lock);
1139 				if (conn->c_state != C_DISCONN_PEND)
1140 					conn->c_state = C_ERROR_CONN;
1141 				mutex_exit(&conn->c_lock);
1142 				break;
1143 			}
1144 
1145 			if (wd->cv_sig == 1) {
1146 				/*
1147 				 * Update completion status and notify poster
1148 				 */
1149 				cv_signal(&wd->wait_cv);
1150 				mutex_exit(&wd->sendwait_lock);
1151 			} else {
1152 				/*
1153 				 * Poster not waiting for notification.
1154 				 * Free the send buffers and send_wid
1155 				 */
1156 				for (i = 0; i < wd->nsbufs; i++) {
1157 					rib_rbuf_free(qptoc(wd->qp),
1158 					    SEND_BUFFER,
1159 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1160 				}
1161 
1162 				/* decrement the send ref count */
1163 				rib_send_rele(qp);
1164 
1165 				mutex_exit(&wd->sendwait_lock);
1166 				(void) rib_free_sendwait(wd);
1167 			}
1168 		}
1169 	}
1170 }
1171 
1172 /*
1173  * RCQ handler
1174  */
1175 /* ARGSUSED */
1176 static void
1177 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1178 {
1179 	rib_qp_t	*qp;
1180 	ibt_status_t	ibt_status;
1181 	ibt_wc_t	wc;
1182 	struct recv_wid	*rwid;
1183 
1184 	/*
1185 	 * Re-enable cq notify here to avoid missing any
1186 	 * completion queue notification.
1187 	 */
1188 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1189 
1190 	ibt_status = IBT_SUCCESS;
1191 	while (ibt_status != IBT_CQ_EMPTY) {
1192 		bzero(&wc, sizeof (wc));
1193 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1194 		if (ibt_status != IBT_SUCCESS)
1195 			return;
1196 
1197 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1198 		qp = rwid->qp;
1199 
1200 		if (wc.wc_status == IBT_WC_SUCCESS) {
1201 			XDR	inxdrs, *xdrs;
1202 			uint_t	xid, vers, op, find_xid = 0;
1203 			struct reply	*r;
1204 			CONN *conn = qptoc(qp);
1205 			uint32_t rdma_credit = 0;
1206 
1207 			xdrs = &inxdrs;
1208 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1209 			    wc.wc_bytes_xfer, XDR_DECODE);
1210 			/*
1211 			 * Treat xid as opaque (xid is the first entity
1212 			 * in the rpc rdma message).
1213 			 */
1214 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1215 
1216 			/* Skip xid and set the xdr position accordingly. */
1217 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1218 			(void) xdr_u_int(xdrs, &vers);
1219 			(void) xdr_u_int(xdrs, &rdma_credit);
1220 			(void) xdr_u_int(xdrs, &op);
1221 			XDR_DESTROY(xdrs);
1222 
1223 			if (vers != RPCRDMA_VERS) {
1224 				/*
1225 				 * Invalid RPC/RDMA version. Cannot
1226 				 * interoperate.  Set connection to
1227 				 * ERROR state and bail out.
1228 				 */
1229 				mutex_enter(&conn->c_lock);
1230 				if (conn->c_state != C_DISCONN_PEND)
1231 					conn->c_state = C_ERROR_CONN;
1232 				mutex_exit(&conn->c_lock);
1233 				rib_rbuf_free(conn, RECV_BUFFER,
1234 				    (void *)(uintptr_t)rwid->addr);
1235 				rib_free_wid(rwid);
1236 				rib_recv_rele(qp);
1237 				continue;
1238 			}
1239 
1240 			mutex_enter(&qp->replylist_lock);
1241 			for (r = qp->replylist; r != NULL; r = r->next) {
1242 				if (r->xid == xid) {
1243 					find_xid = 1;
1244 					switch (op) {
1245 					case RDMA_MSG:
1246 					case RDMA_NOMSG:
1247 					case RDMA_MSGP:
1248 						r->status = RDMA_SUCCESS;
1249 						r->vaddr_cq = rwid->addr;
1250 						r->bytes_xfer =
1251 						    wc.wc_bytes_xfer;
1252 						cv_signal(&r->wait_cv);
1253 						break;
1254 					default:
1255 						rib_rbuf_free(qptoc(qp),
1256 						    RECV_BUFFER,
1257 						    (void *)(uintptr_t)
1258 						    rwid->addr);
1259 						break;
1260 					}
1261 					break;
1262 				}
1263 			}
1264 			mutex_exit(&qp->replylist_lock);
1265 			if (find_xid == 0) {
1266 				/* RPC caller not waiting for reply */
1267 
1268 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1269 				    int, xid);
1270 
1271 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1272 				    (void *)(uintptr_t)rwid->addr);
1273 			}
1274 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1275 			CONN *conn = qptoc(qp);
1276 
1277 			/*
1278 			 * Connection being flushed. Just free
1279 			 * the posted buffer
1280 			 */
1281 			rib_rbuf_free(conn, RECV_BUFFER,
1282 			    (void *)(uintptr_t)rwid->addr);
1283 		} else {
1284 			CONN *conn = qptoc(qp);
1285 /*
1286  *  RC Recv Q Error Code		Local state     Remote State
1287  *  ====================		===========     ============
1288  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1289  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1290  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1291  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1292  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1293  *  IBT_WC_WR_FLUSHED_ERR               None            None
1294  */
1295 			/*
1296 			 * Channel in error state. Set connection
1297 			 * in ERROR state.
1298 			 */
1299 			mutex_enter(&conn->c_lock);
1300 			if (conn->c_state != C_DISCONN_PEND)
1301 				conn->c_state = C_ERROR_CONN;
1302 			mutex_exit(&conn->c_lock);
1303 			rib_rbuf_free(conn, RECV_BUFFER,
1304 			    (void *)(uintptr_t)rwid->addr);
1305 		}
1306 		rib_free_wid(rwid);
1307 		rib_recv_rele(qp);
1308 	}
1309 }
1310 
1311 /* Server side */
1312 /* ARGSUSED */
1313 static void
1314 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1315 {
1316 	rdma_recv_data_t *rdp;
1317 	rib_qp_t	*qp;
1318 	ibt_status_t	ibt_status;
1319 	ibt_wc_t	wc;
1320 	struct svc_recv	*s_recvp;
1321 	CONN		*conn;
1322 	mblk_t		*mp;
1323 
1324 	/*
1325 	 * Re-enable cq notify here to avoid missing any
1326 	 * completion queue notification.
1327 	 */
1328 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1329 
1330 	ibt_status = IBT_SUCCESS;
1331 	while (ibt_status != IBT_CQ_EMPTY) {
1332 		bzero(&wc, sizeof (wc));
1333 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1334 		if (ibt_status != IBT_SUCCESS)
1335 			return;
1336 
1337 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1338 		qp = s_recvp->qp;
1339 		conn = qptoc(qp);
1340 
1341 		if (wc.wc_status == IBT_WC_SUCCESS) {
1342 			XDR	inxdrs, *xdrs;
1343 			uint_t	xid, vers, op;
1344 			uint32_t rdma_credit;
1345 
1346 			xdrs = &inxdrs;
1347 			/* s_recvp->vaddr stores data */
1348 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1349 			    wc.wc_bytes_xfer, XDR_DECODE);
1350 
1351 			/*
1352 			 * Treat xid as opaque (xid is the first entity
1353 			 * in the rpc rdma message).
1354 			 */
1355 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1356 			/* Skip xid and set the xdr position accordingly. */
1357 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1358 			if (!xdr_u_int(xdrs, &vers) ||
1359 			    !xdr_u_int(xdrs, &rdma_credit) ||
1360 			    !xdr_u_int(xdrs, &op)) {
1361 				rib_rbuf_free(conn, RECV_BUFFER,
1362 				    (void *)(uintptr_t)s_recvp->vaddr);
1363 				XDR_DESTROY(xdrs);
1364 				rib_recv_rele(qp);
1365 				(void) rib_free_svc_recv(s_recvp);
1366 				continue;
1367 			}
1368 			XDR_DESTROY(xdrs);
1369 
1370 			if (vers != RPCRDMA_VERS) {
1371 				/*
1372 				 * Invalid RPC/RDMA version.
1373 				 * Drop rpc rdma message.
1374 				 */
1375 				rib_rbuf_free(conn, RECV_BUFFER,
1376 				    (void *)(uintptr_t)s_recvp->vaddr);
1377 				rib_recv_rele(qp);
1378 				(void) rib_free_svc_recv(s_recvp);
1379 				continue;
1380 			}
1381 			/*
1382 			 * Is this for RDMA_DONE?
1383 			 */
1384 			if (op == RDMA_DONE) {
1385 				rib_rbuf_free(conn, RECV_BUFFER,
1386 				    (void *)(uintptr_t)s_recvp->vaddr);
1387 				/*
1388 				 * Wake up the thread waiting on
1389 				 * a RDMA_DONE for xid
1390 				 */
1391 				mutex_enter(&qp->rdlist_lock);
1392 				rdma_done_notify(qp, xid);
1393 				mutex_exit(&qp->rdlist_lock);
1394 				rib_recv_rele(qp);
1395 				(void) rib_free_svc_recv(s_recvp);
1396 				continue;
1397 			}
1398 
1399 			mutex_enter(&plugin_state_lock);
1400 			if (plugin_state == ACCEPT) {
1401 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1402 				    == NULL)
1403 					(void) strwaitbuf(
1404 					    sizeof (*rdp), BPRI_LO);
1405 				/*
1406 				 * Plugin is in accept state, hence the master
1407 				 * transport queue for this is still accepting
1408 				 * requests. Hence we can call svc_queuereq to
1409 				 * queue this recieved msg.
1410 				 */
1411 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1412 				rdp->conn = conn;
1413 				rdp->rpcmsg.addr =
1414 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1415 				rdp->rpcmsg.type = RECV_BUFFER;
1416 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1417 				rdp->status = wc.wc_status;
1418 				mutex_enter(&conn->c_lock);
1419 				conn->c_ref++;
1420 				mutex_exit(&conn->c_lock);
1421 				mp->b_wptr += sizeof (*rdp);
1422 				svc_queuereq((queue_t *)rib_stat->q, mp);
1423 				mutex_exit(&plugin_state_lock);
1424 			} else {
1425 				/*
1426 				 * The master transport for this is going
1427 				 * away and the queue is not accepting anymore
1428 				 * requests for krpc, so don't do anything, just
1429 				 * free the msg.
1430 				 */
1431 				mutex_exit(&plugin_state_lock);
1432 				rib_rbuf_free(conn, RECV_BUFFER,
1433 				    (void *)(uintptr_t)s_recvp->vaddr);
1434 			}
1435 		} else {
1436 			rib_rbuf_free(conn, RECV_BUFFER,
1437 			    (void *)(uintptr_t)s_recvp->vaddr);
1438 		}
1439 		rib_recv_rele(qp);
1440 		(void) rib_free_svc_recv(s_recvp);
1441 	}
1442 }
1443 
1444 static void
1445 rib_attach_hca()
1446 {
1447 	mutex_enter(&rib_stat->open_hca_lock);
1448 	(void) rpcib_open_hcas(rib_stat);
1449 	rib_listen(NULL);
1450 	mutex_exit(&rib_stat->open_hca_lock);
1451 }
1452 
1453 /*
1454  * Handles DR event of IBT_HCA_DETACH_EVENT.
1455  */
1456 /* ARGSUSED */
1457 static void
1458 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1459 	ibt_async_code_t code, ibt_async_event_t *event)
1460 {
1461 	switch (code) {
1462 	case IBT_HCA_ATTACH_EVENT:
1463 		rib_attach_hca();
1464 		break;
1465 	case IBT_HCA_DETACH_EVENT:
1466 		rib_detach_hca(hca_hdl);
1467 #ifdef DEBUG
1468 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1469 #endif
1470 		break;
1471 	case IBT_EVENT_PORT_UP:
1472 		/*
1473 		 * A port is up. We should call rib_listen() since there is
1474 		 * a chance that rib_listen() may have failed during
1475 		 * rib_attach_hca() because the port had not been up yet.
1476 		 */
1477 		rib_listen(NULL);
1478 #ifdef DEBUG
1479 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1480 #endif
1481 		break;
1482 #ifdef DEBUG
1483 	case IBT_EVENT_PATH_MIGRATED:
1484 		cmn_err(CE_NOTE, "rib_async_handler(): "
1485 		    "IBT_EVENT_PATH_MIGRATED\n");
1486 		break;
1487 	case IBT_EVENT_SQD:
1488 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1489 		break;
1490 	case IBT_EVENT_COM_EST:
1491 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1492 		break;
1493 	case IBT_ERROR_CATASTROPHIC_CHAN:
1494 		cmn_err(CE_NOTE, "rib_async_handler(): "
1495 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1496 		break;
1497 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1498 		cmn_err(CE_NOTE, "rib_async_handler(): "
1499 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1500 		break;
1501 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1502 		cmn_err(CE_NOTE, "rib_async_handler(): "
1503 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1504 		break;
1505 	case IBT_ERROR_PATH_MIGRATE_REQ:
1506 		cmn_err(CE_NOTE, "rib_async_handler(): "
1507 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1508 		break;
1509 	case IBT_ERROR_CQ:
1510 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1511 		break;
1512 	case IBT_ERROR_PORT_DOWN:
1513 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1514 		break;
1515 	case IBT_ASYNC_OPAQUE1:
1516 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1517 		break;
1518 	case IBT_ASYNC_OPAQUE2:
1519 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1520 		break;
1521 	case IBT_ASYNC_OPAQUE3:
1522 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1523 		break;
1524 	case IBT_ASYNC_OPAQUE4:
1525 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1526 		break;
1527 #endif
1528 	default:
1529 		break;
1530 	}
1531 }
1532 
1533 /*
1534  * Client's reachable function.
1535  */
1536 static rdma_stat
1537 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1538 {
1539 	rdma_stat	status;
1540 	rpcib_ping_t	rpt;
1541 	struct netbuf	saddr;
1542 	CONN		*conn;
1543 
1544 	bzero(&saddr, sizeof (struct netbuf));
1545 	status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1546 
1547 	if (status == RDMA_SUCCESS) {
1548 		*handle = (void *)rpt.hca;
1549 		/* release the reference */
1550 		(void) rib_conn_release(conn);
1551 		return (RDMA_SUCCESS);
1552 	} else {
1553 		*handle = NULL;
1554 		DTRACE_PROBE(rpcib__i__pingfailed);
1555 		return (RDMA_FAILED);
1556 	}
1557 }
1558 
1559 /* Client side qp creation */
1560 static rdma_stat
1561 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1562 {
1563 	rib_qp_t	*kqp = NULL;
1564 	CONN		*conn;
1565 	rdma_clnt_cred_ctrl_t *cc_info;
1566 
1567 	ASSERT(qp != NULL);
1568 	*qp = NULL;
1569 
1570 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1571 	conn = qptoc(kqp);
1572 	kqp->hca = hca;
1573 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1574 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1575 
1576 	kqp->mode = RIB_CLIENT;
1577 	kqp->chan_flags = IBT_BLOCKING;
1578 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1579 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1580 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1581 	/*
1582 	 * Initialize
1583 	 */
1584 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1585 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1586 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1587 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1588 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1589 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1590 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1591 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1592 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1593 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1594 	/*
1595 	 * Initialize the client credit control
1596 	 * portion of the rdmaconn struct.
1597 	 */
1598 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1599 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1600 	cc_info->clnt_cc_granted_ops = 0;
1601 	cc_info->clnt_cc_in_flight_ops = 0;
1602 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1603 
1604 	*qp = kqp;
1605 	return (RDMA_SUCCESS);
1606 }
1607 
1608 /* Server side qp creation */
1609 static rdma_stat
1610 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1611 {
1612 	rib_qp_t	*kqp = NULL;
1613 	ibt_chan_sizes_t	chan_sizes;
1614 	ibt_rc_chan_alloc_args_t	qp_attr;
1615 	ibt_status_t		ibt_status;
1616 	rdma_srv_cred_ctrl_t *cc_info;
1617 
1618 	*qp = NULL;
1619 
1620 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1621 	kqp->hca = hca;
1622 	kqp->port_num = port;
1623 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1624 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1625 
1626 	/*
1627 	 * Create the qp handle
1628 	 */
1629 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1630 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1631 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1632 	qp_attr.rc_pd = hca->pd_hdl;
1633 	qp_attr.rc_hca_port_num = port;
1634 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1635 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1636 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1637 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1638 	qp_attr.rc_clone_chan = NULL;
1639 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1640 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1641 
1642 	rw_enter(&hca->state_lock, RW_READER);
1643 	if (hca->state != HCA_DETACHED) {
1644 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1645 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1646 		    &chan_sizes);
1647 	} else {
1648 		rw_exit(&hca->state_lock);
1649 		goto fail;
1650 	}
1651 	rw_exit(&hca->state_lock);
1652 
1653 	if (ibt_status != IBT_SUCCESS) {
1654 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1655 		    int, ibt_status);
1656 		goto fail;
1657 	}
1658 
1659 	kqp->mode = RIB_SERVER;
1660 	kqp->chan_flags = IBT_BLOCKING;
1661 	kqp->q = q;	/* server ONLY */
1662 
1663 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1664 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1665 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1666 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1667 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1668 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1669 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1670 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1671 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1672 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1673 	/*
1674 	 * Set the private data area to qp to be used in callbacks
1675 	 */
1676 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1677 	kqp->rdmaconn.c_state = C_CONNECTED;
1678 
1679 	/*
1680 	 * Initialize the server credit control
1681 	 * portion of the rdmaconn struct.
1682 	 */
1683 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1684 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1685 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1686 	cc_info->srv_cc_cur_buffers_used = 0;
1687 	cc_info->srv_cc_posted = preposted_rbufs;
1688 
1689 	*qp = kqp;
1690 
1691 	return (RDMA_SUCCESS);
1692 fail:
1693 	if (kqp)
1694 		kmem_free(kqp, sizeof (rib_qp_t));
1695 
1696 	return (RDMA_FAILED);
1697 }
1698 
1699 /* ARGSUSED */
1700 ibt_cm_status_t
1701 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1702     ibt_cm_return_args_t *ret_args, void *priv_data,
1703     ibt_priv_data_len_t len)
1704 {
1705 	rib_hca_t	*hca;
1706 
1707 	hca = (rib_hca_t *)clnt_hdl;
1708 
1709 	switch (event->cm_type) {
1710 
1711 	/* got a connection close event */
1712 	case IBT_CM_EVENT_CONN_CLOSED:
1713 	{
1714 		CONN	*conn;
1715 		rib_qp_t *qp;
1716 
1717 		/* check reason why connection was closed */
1718 		switch (event->cm_event.closed) {
1719 		case IBT_CM_CLOSED_DREP_RCVD:
1720 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1721 		case IBT_CM_CLOSED_DUP:
1722 		case IBT_CM_CLOSED_ABORT:
1723 		case IBT_CM_CLOSED_ALREADY:
1724 			/*
1725 			 * These cases indicate the local end initiated
1726 			 * the closing of the channel. Nothing to do here.
1727 			 */
1728 			break;
1729 		default:
1730 			/*
1731 			 * Reason for CONN_CLOSED event must be one of
1732 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1733 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1734 			 * the remote end is closing the channel. In these
1735 			 * cases free the channel and transition to error
1736 			 * state
1737 			 */
1738 			qp = ibt_get_chan_private(event->cm_channel);
1739 			conn = qptoc(qp);
1740 			mutex_enter(&conn->c_lock);
1741 			if (conn->c_state == C_DISCONN_PEND) {
1742 				mutex_exit(&conn->c_lock);
1743 				break;
1744 			}
1745 
1746 			conn->c_state = C_ERROR_CONN;
1747 
1748 			/*
1749 			 * Free the conn if c_ref is down to 0 already
1750 			 */
1751 			if (conn->c_ref == 0) {
1752 				/*
1753 				 * Remove from list and free conn
1754 				 */
1755 				conn->c_state = C_DISCONN_PEND;
1756 				mutex_exit(&conn->c_lock);
1757 				rw_enter(&hca->state_lock, RW_READER);
1758 				if (hca->state != HCA_DETACHED)
1759 					(void) rib_disconnect_channel(conn,
1760 					    &hca->cl_conn_list);
1761 				rw_exit(&hca->state_lock);
1762 			} else {
1763 				/*
1764 				 * conn will be freed when c_ref goes to 0.
1765 				 * Indicate to cleaning thread not to close
1766 				 * the connection, but just free the channel.
1767 				 */
1768 				conn->c_flags |= C_CLOSE_NOTNEEDED;
1769 				mutex_exit(&conn->c_lock);
1770 			}
1771 #ifdef DEBUG
1772 			if (rib_debug)
1773 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1774 				    "(CONN_CLOSED) channel disconnected");
1775 #endif
1776 			break;
1777 		}
1778 		break;
1779 	}
1780 	default:
1781 		break;
1782 	}
1783 	return (IBT_CM_ACCEPT);
1784 }
1785 
1786 /*
1787  * Connect to the server.
1788  */
1789 rdma_stat
1790 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1791 {
1792 	ibt_chan_open_args_t	chan_args;	/* channel args */
1793 	ibt_chan_sizes_t	chan_sizes;
1794 	ibt_rc_chan_alloc_args_t	qp_attr;
1795 	ibt_status_t		ibt_status;
1796 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1797 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1798 	ibt_ip_cm_info_t	ipcm_info;
1799 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1800 
1801 
1802 	(void) bzero(&chan_args, sizeof (chan_args));
1803 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1804 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1805 
1806 	ipcm_info.src_addr.family = rptp->srcip.family;
1807 	switch (ipcm_info.src_addr.family) {
1808 	case AF_INET:
1809 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1810 		break;
1811 	case AF_INET6:
1812 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1813 		break;
1814 	}
1815 
1816 	ipcm_info.dst_addr.family = rptp->srcip.family;
1817 	switch (ipcm_info.dst_addr.family) {
1818 	case AF_INET:
1819 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1820 		break;
1821 	case AF_INET6:
1822 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1823 		break;
1824 	}
1825 
1826 	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1827 
1828 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1829 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1830 
1831 	if (ibt_status != IBT_SUCCESS) {
1832 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1833 		return (-1);
1834 	}
1835 
1836 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1837 	/* Alloc a RC channel */
1838 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1839 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1840 	qp_attr.rc_pd = hca->pd_hdl;
1841 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1842 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1843 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1844 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1845 	qp_attr.rc_clone_chan = NULL;
1846 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1847 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1848 
1849 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1850 	chan_args.oc_path = &rptp->path;
1851 
1852 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1853 	chan_args.oc_cm_clnt_private = (void *)hca;
1854 	chan_args.oc_rdma_ra_out = 4;
1855 	chan_args.oc_rdma_ra_in = 4;
1856 	chan_args.oc_path_retry_cnt = 2;
1857 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1858 	chan_args.oc_priv_data = cmp_ip_pvt;
1859 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1860 
1861 refresh:
1862 	rw_enter(&hca->state_lock, RW_READER);
1863 	if (hca->state != HCA_DETACHED) {
1864 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1865 		    IBT_ACHAN_NO_FLAGS,
1866 		    &qp_attr, &qp->qp_hdl,
1867 		    &chan_sizes);
1868 	} else {
1869 		rw_exit(&hca->state_lock);
1870 		return (RDMA_FAILED);
1871 	}
1872 	rw_exit(&hca->state_lock);
1873 
1874 	if (ibt_status != IBT_SUCCESS) {
1875 		DTRACE_PROBE1(rpcib__i_conntosrv,
1876 		    int, ibt_status);
1877 		return (RDMA_FAILED);
1878 	}
1879 
1880 	/* Connect to the Server */
1881 	(void) bzero(&ret_args, sizeof (ret_args));
1882 	mutex_enter(&qp->cb_lock);
1883 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1884 	    IBT_BLOCKING, &chan_args, &ret_args);
1885 	if (ibt_status != IBT_SUCCESS) {
1886 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1887 		    int, ibt_status, int, ret_args.rc_status);
1888 
1889 		(void) ibt_free_channel(qp->qp_hdl);
1890 		qp->qp_hdl = NULL;
1891 		mutex_exit(&qp->cb_lock);
1892 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1893 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1894 			/*
1895 			 * Got IBT_CM_CONN_STALE probably because of stale
1896 			 * data on the passive end of a channel that existed
1897 			 * prior to reboot. Retry establishing a channel
1898 			 * REFRESH_ATTEMPTS times, during which time the
1899 			 * stale conditions on the server might clear up.
1900 			 */
1901 			goto refresh;
1902 		}
1903 		return (RDMA_FAILED);
1904 	}
1905 	mutex_exit(&qp->cb_lock);
1906 	/*
1907 	 * Set the private data area to qp to be used in callbacks
1908 	 */
1909 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1910 	return (RDMA_SUCCESS);
1911 }
1912 
1913 rdma_stat
1914 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1915 {
1916 	uint_t			i, addr_count;
1917 	ibt_status_t		ibt_status;
1918 	uint8_t			num_paths_p;
1919 	ibt_ip_path_attr_t	ipattr;
1920 	ibt_path_ip_src_t	srcip;
1921 	rpcib_ipaddrs_t		addrs4;
1922 	rpcib_ipaddrs_t		addrs6;
1923 	struct sockaddr_in	*sinp;
1924 	struct sockaddr_in6	*sin6p;
1925 	rdma_stat		retval = RDMA_FAILED;
1926 	rib_hca_t *hca;
1927 
1928 	if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1929 		return (RDMA_INVAL);
1930 	ASSERT(raddr->buf != NULL);
1931 
1932 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1933 
1934 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1935 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1936 		retval = RDMA_FAILED;
1937 		goto done2;
1938 	}
1939 
1940 	if (addr_type == AF_INET) {
1941 		addr_count = addrs4.ri_count;
1942 		sinp = (struct sockaddr_in *)raddr->buf;
1943 		rptp->dstip.family = AF_INET;
1944 		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1945 		sinp = addrs4.ri_list;
1946 	} else {
1947 		addr_count = addrs6.ri_count;
1948 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1949 		rptp->dstip.family = AF_INET6;
1950 		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1951 		sin6p = addrs6.ri_list;
1952 	}
1953 
1954 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1955 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1956 		rw_enter(&hca->state_lock, RW_READER);
1957 		if (hca->state == HCA_DETACHED) {
1958 			rw_exit(&hca->state_lock);
1959 			continue;
1960 		}
1961 
1962 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1963 		ipattr.ipa_hca_guid	= hca->hca_guid;
1964 		ipattr.ipa_ndst		= 1;
1965 		ipattr.ipa_max_paths	= 1;
1966 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1967 		for (i = 0; i < addr_count; i++) {
1968 			num_paths_p = 0;
1969 			if (addr_type == AF_INET) {
1970 				ipattr.ipa_src_ip.un.ip4addr =
1971 				    sinp[i].sin_addr.s_addr;
1972 			} else {
1973 				ipattr.ipa_src_ip.un.ip6addr =
1974 				    sin6p[i].sin6_addr;
1975 			}
1976 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1977 
1978 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1979 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1980 			    &num_paths_p, &srcip);
1981 			if (ibt_status == IBT_SUCCESS &&
1982 			    num_paths_p != 0 &&
1983 			    rptp->path.pi_hca_guid == hca->hca_guid) {
1984 				rptp->hca = hca;
1985 				rw_exit(&hca->state_lock);
1986 				if (addr_type == AF_INET) {
1987 					rptp->srcip.family = AF_INET;
1988 					rptp->srcip.un.ip4addr =
1989 					    srcip.ip_primary.un.ip4addr;
1990 				} else {
1991 					rptp->srcip.family = AF_INET6;
1992 					rptp->srcip.un.ip6addr =
1993 					    srcip.ip_primary.un.ip6addr;
1994 
1995 				}
1996 				retval = RDMA_SUCCESS;
1997 				goto done1;
1998 			}
1999 		}
2000 		rw_exit(&hca->state_lock);
2001 	}
2002 done1:
2003 	rw_exit(&rib_stat->hcas_list_lock);
2004 done2:
2005 	if (addrs4.ri_size > 0)
2006 		kmem_free(addrs4.ri_list, addrs4.ri_size);
2007 	if (addrs6.ri_size > 0)
2008 		kmem_free(addrs6.ri_list, addrs6.ri_size);
2009 	return (retval);
2010 }
2011 
2012 /*
2013  * Close channel, remove from connection list and
2014  * free up resources allocated for that channel.
2015  */
2016 rdma_stat
2017 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2018 {
2019 	rib_qp_t	*qp = ctoqp(conn);
2020 	rib_hca_t	*hca;
2021 
2022 	mutex_enter(&conn->c_lock);
2023 	if (conn->c_timeout != NULL) {
2024 		mutex_exit(&conn->c_lock);
2025 		(void) untimeout(conn->c_timeout);
2026 		mutex_enter(&conn->c_lock);
2027 	}
2028 
2029 	while (conn->c_flags & C_CLOSE_PENDING) {
2030 		cv_wait(&conn->c_cv, &conn->c_lock);
2031 	}
2032 	mutex_exit(&conn->c_lock);
2033 
2034 	/*
2035 	 * c_ref == 0 and connection is in C_DISCONN_PEND
2036 	 */
2037 	hca = qp->hca;
2038 	if (conn_list != NULL)
2039 		(void) rib_rm_conn(conn, conn_list);
2040 
2041 	/*
2042 	 * There is only one case where we get here with
2043 	 * qp_hdl = NULL, which is during connection setup on
2044 	 * the client. In such a case there are no posted
2045 	 * send/recv buffers.
2046 	 */
2047 	if (qp->qp_hdl != NULL) {
2048 		mutex_enter(&qp->posted_rbufs_lock);
2049 		while (qp->n_posted_rbufs)
2050 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2051 		mutex_exit(&qp->posted_rbufs_lock);
2052 
2053 		mutex_enter(&qp->send_rbufs_lock);
2054 		while (qp->n_send_rbufs)
2055 			cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2056 			mutex_exit(&qp->send_rbufs_lock);
2057 
2058 		(void) ibt_free_channel(qp->qp_hdl);
2059 			qp->qp_hdl = NULL;
2060 	}
2061 
2062 	ASSERT(qp->rdlist == NULL);
2063 
2064 	if (qp->replylist != NULL) {
2065 		(void) rib_rem_replylist(qp);
2066 	}
2067 
2068 	cv_destroy(&qp->cb_conn_cv);
2069 	cv_destroy(&qp->posted_rbufs_cv);
2070 	cv_destroy(&qp->send_rbufs_cv);
2071 	mutex_destroy(&qp->cb_lock);
2072 	mutex_destroy(&qp->replylist_lock);
2073 	mutex_destroy(&qp->posted_rbufs_lock);
2074 	mutex_destroy(&qp->send_rbufs_lock);
2075 	mutex_destroy(&qp->rdlist_lock);
2076 
2077 	cv_destroy(&conn->c_cv);
2078 	mutex_destroy(&conn->c_lock);
2079 
2080 	if (conn->c_raddr.buf != NULL) {
2081 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2082 	}
2083 	if (conn->c_laddr.buf != NULL) {
2084 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2085 	}
2086 	if (conn->c_netid != NULL) {
2087 		kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2088 	}
2089 
2090 	/*
2091 	 * Credit control cleanup.
2092 	 */
2093 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2094 		rdma_clnt_cred_ctrl_t *cc_info;
2095 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2096 		cv_destroy(&cc_info->clnt_cc_cv);
2097 	}
2098 
2099 	kmem_free(qp, sizeof (rib_qp_t));
2100 
2101 	/*
2102 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2103 	 * then the hca is no longer being used.
2104 	 */
2105 	if (conn_list != NULL) {
2106 		rw_enter(&hca->state_lock, RW_READER);
2107 		if (hca->state == HCA_DETACHED) {
2108 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2109 			if (hca->srv_conn_list.conn_hd == NULL) {
2110 				rw_enter(&hca->cl_conn_list.conn_lock,
2111 				    RW_READER);
2112 
2113 				if (hca->cl_conn_list.conn_hd == NULL) {
2114 					mutex_enter(&hca->inuse_lock);
2115 					hca->inuse = FALSE;
2116 					cv_signal(&hca->cb_cv);
2117 					mutex_exit(&hca->inuse_lock);
2118 				}
2119 				rw_exit(&hca->cl_conn_list.conn_lock);
2120 			}
2121 			rw_exit(&hca->srv_conn_list.conn_lock);
2122 		}
2123 		rw_exit(&hca->state_lock);
2124 	}
2125 
2126 	return (RDMA_SUCCESS);
2127 }
2128 
2129 /*
2130  * All sends are done under the protection of
2131  * the wdesc->sendwait_lock. n_send_rbufs count
2132  * is protected using the send_rbufs_lock.
2133  * lock ordering is:
2134  * sendwait_lock -> send_rbufs_lock
2135  */
2136 
2137 void
2138 rib_send_hold(rib_qp_t *qp)
2139 {
2140 	mutex_enter(&qp->send_rbufs_lock);
2141 	qp->n_send_rbufs++;
2142 	mutex_exit(&qp->send_rbufs_lock);
2143 }
2144 
2145 void
2146 rib_send_rele(rib_qp_t *qp)
2147 {
2148 	mutex_enter(&qp->send_rbufs_lock);
2149 	qp->n_send_rbufs--;
2150 	if (qp->n_send_rbufs == 0)
2151 		cv_signal(&qp->send_rbufs_cv);
2152 	mutex_exit(&qp->send_rbufs_lock);
2153 }
2154 
2155 void
2156 rib_recv_rele(rib_qp_t *qp)
2157 {
2158 	mutex_enter(&qp->posted_rbufs_lock);
2159 	qp->n_posted_rbufs--;
2160 	if (qp->n_posted_rbufs == 0)
2161 		cv_signal(&qp->posted_rbufs_cv);
2162 	mutex_exit(&qp->posted_rbufs_lock);
2163 }
2164 
2165 /*
2166  * Wait for send completion notification. Only on receiving a
2167  * notification be it a successful or error completion, free the
2168  * send_wid.
2169  */
2170 static rdma_stat
2171 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2172 {
2173 	clock_t timout, cv_wait_ret;
2174 	rdma_stat error = RDMA_SUCCESS;
2175 	int	i;
2176 
2177 	/*
2178 	 * Wait for send to complete
2179 	 */
2180 	ASSERT(wd != NULL);
2181 	mutex_enter(&wd->sendwait_lock);
2182 	if (wd->status == (uint_t)SEND_WAIT) {
2183 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2184 		    ddi_get_lbolt();
2185 
2186 		if (qp->mode == RIB_SERVER) {
2187 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2188 			    &wd->sendwait_lock, timout)) > 0 &&
2189 			    wd->status == (uint_t)SEND_WAIT)
2190 				;
2191 			switch (cv_wait_ret) {
2192 			case -1:	/* timeout */
2193 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2194 
2195 				wd->cv_sig = 0;		/* no signal needed */
2196 				error = RDMA_TIMEDOUT;
2197 				break;
2198 			default:	/* got send completion */
2199 				break;
2200 			}
2201 		} else {
2202 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2203 			    &wd->sendwait_lock, timout)) > 0 &&
2204 			    wd->status == (uint_t)SEND_WAIT)
2205 				;
2206 			switch (cv_wait_ret) {
2207 			case -1:	/* timeout */
2208 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2209 
2210 				wd->cv_sig = 0;		/* no signal needed */
2211 				error = RDMA_TIMEDOUT;
2212 				break;
2213 			case 0:		/* interrupted */
2214 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2215 
2216 				wd->cv_sig = 0;		/* no signal needed */
2217 				error = RDMA_INTR;
2218 				break;
2219 			default:	/* got send completion */
2220 				break;
2221 			}
2222 		}
2223 	}
2224 
2225 	if (wd->status != (uint_t)SEND_WAIT) {
2226 		/* got send completion */
2227 		if (wd->status != RDMA_SUCCESS) {
2228 			switch (wd->status) {
2229 			case RDMA_CONNLOST:
2230 				error = RDMA_CONNLOST;
2231 				break;
2232 			default:
2233 				error = RDMA_FAILED;
2234 				break;
2235 			}
2236 		}
2237 		for (i = 0; i < wd->nsbufs; i++) {
2238 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2239 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2240 		}
2241 
2242 		rib_send_rele(qp);
2243 
2244 		mutex_exit(&wd->sendwait_lock);
2245 		(void) rib_free_sendwait(wd);
2246 
2247 	} else {
2248 		mutex_exit(&wd->sendwait_lock);
2249 	}
2250 	return (error);
2251 }
2252 
2253 static struct send_wid *
2254 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2255 {
2256 	struct send_wid	*wd;
2257 
2258 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2259 	wd->xid = xid;
2260 	wd->cv_sig = cv_sig;
2261 	wd->qp = qp;
2262 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2263 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2264 	wd->status = (uint_t)SEND_WAIT;
2265 
2266 	return (wd);
2267 }
2268 
2269 static int
2270 rib_free_sendwait(struct send_wid *wdesc)
2271 {
2272 	cv_destroy(&wdesc->wait_cv);
2273 	mutex_destroy(&wdesc->sendwait_lock);
2274 	kmem_free(wdesc, sizeof (*wdesc));
2275 
2276 	return (0);
2277 }
2278 
2279 static rdma_stat
2280 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2281 {
2282 	mutex_enter(&qp->replylist_lock);
2283 	if (rep != NULL) {
2284 		(void) rib_remreply(qp, rep);
2285 		mutex_exit(&qp->replylist_lock);
2286 		return (RDMA_SUCCESS);
2287 	}
2288 	mutex_exit(&qp->replylist_lock);
2289 	return (RDMA_FAILED);
2290 }
2291 
2292 /*
2293  * Send buffers are freed here only in case of error in posting
2294  * on QP. If the post succeeded, the send buffers are freed upon
2295  * send completion in rib_sendwait() or in the scq_handler.
2296  */
2297 rdma_stat
2298 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2299 	int send_sig, int cv_sig, caddr_t *swid)
2300 {
2301 	struct send_wid	*wdesc;
2302 	struct clist	*clp;
2303 	ibt_status_t	ibt_status = IBT_SUCCESS;
2304 	rdma_stat	ret = RDMA_SUCCESS;
2305 	ibt_send_wr_t	tx_wr;
2306 	int		i, nds;
2307 	ibt_wr_ds_t	sgl[DSEG_MAX];
2308 	uint_t		total_msg_size;
2309 	rib_qp_t	*qp;
2310 
2311 	qp = ctoqp(conn);
2312 
2313 	ASSERT(cl != NULL);
2314 
2315 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2316 
2317 	nds = 0;
2318 	total_msg_size = 0;
2319 	clp = cl;
2320 	while (clp != NULL) {
2321 		if (nds >= DSEG_MAX) {
2322 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2323 			return (RDMA_FAILED);
2324 		}
2325 		sgl[nds].ds_va = clp->w.c_saddr;
2326 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2327 		sgl[nds].ds_len = clp->c_len;
2328 		total_msg_size += clp->c_len;
2329 		clp = clp->c_next;
2330 		nds++;
2331 	}
2332 
2333 	if (send_sig) {
2334 		/* Set SEND_SIGNAL flag. */
2335 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2336 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2337 		*swid = (caddr_t)wdesc;
2338 		tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2339 		mutex_enter(&wdesc->sendwait_lock);
2340 		wdesc->nsbufs = nds;
2341 		for (i = 0; i < nds; i++) {
2342 			wdesc->sbufaddr[i] = sgl[i].ds_va;
2343 		}
2344 	} else {
2345 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2346 		*swid = NULL;
2347 		tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2348 	}
2349 
2350 	tx_wr.wr_opcode = IBT_WRC_SEND;
2351 	tx_wr.wr_trans = IBT_RC_SRV;
2352 	tx_wr.wr_nds = nds;
2353 	tx_wr.wr_sgl = sgl;
2354 
2355 	mutex_enter(&conn->c_lock);
2356 	if (conn->c_state == C_CONNECTED) {
2357 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2358 	}
2359 	if (conn->c_state != C_CONNECTED ||
2360 	    ibt_status != IBT_SUCCESS) {
2361 		if (conn->c_state != C_DISCONN_PEND)
2362 			conn->c_state = C_ERROR_CONN;
2363 		mutex_exit(&conn->c_lock);
2364 		if (send_sig) {
2365 			for (i = 0; i < nds; i++) {
2366 				rib_rbuf_free(conn, SEND_BUFFER,
2367 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2368 			}
2369 			mutex_exit(&wdesc->sendwait_lock);
2370 			(void) rib_free_sendwait(wdesc);
2371 		}
2372 		return (RDMA_CONNLOST);
2373 	}
2374 
2375 	mutex_exit(&conn->c_lock);
2376 
2377 	if (send_sig) {
2378 		rib_send_hold(qp);
2379 		mutex_exit(&wdesc->sendwait_lock);
2380 		if (cv_sig) {
2381 			/*
2382 			 * cv_wait for send to complete.
2383 			 * We can fail due to a timeout or signal or
2384 			 * unsuccessful send.
2385 			 */
2386 			ret = rib_sendwait(qp, wdesc);
2387 
2388 			return (ret);
2389 		}
2390 	}
2391 
2392 	return (RDMA_SUCCESS);
2393 }
2394 
2395 
2396 rdma_stat
2397 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2398 {
2399 	rdma_stat	ret;
2400 	caddr_t		wd;
2401 
2402 	/* send-wait & cv_signal */
2403 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2404 	return (ret);
2405 }
2406 
2407 /*
2408  * Deprecated/obsolete interface not used currently
2409  * but earlier used for READ-READ protocol.
2410  * Send RPC reply and wait for RDMA_DONE.
2411  */
2412 rdma_stat
2413 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2414 {
2415 	rdma_stat ret = RDMA_SUCCESS;
2416 	struct rdma_done_list *rd;
2417 	clock_t cv_wait_ret;
2418 	caddr_t *wid = NULL;
2419 	rib_qp_t *qp = ctoqp(conn);
2420 
2421 	mutex_enter(&qp->rdlist_lock);
2422 	rd = rdma_done_add(qp, msgid);
2423 
2424 	/* No cv_signal (whether send-wait or no-send-wait) */
2425 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2426 
2427 	if (ret != RDMA_SUCCESS) {
2428 		rdma_done_rm(qp, rd);
2429 	} else {
2430 		/*
2431 		 * Wait for RDMA_DONE from remote end
2432 		 */
2433 		cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2434 		    &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2435 		    TR_CLOCK_TICK);
2436 
2437 		rdma_done_rm(qp, rd);
2438 
2439 		if (cv_wait_ret < 0) {
2440 			ret = RDMA_TIMEDOUT;
2441 		}
2442 	}
2443 
2444 	mutex_exit(&qp->rdlist_lock);
2445 	return (ret);
2446 }
2447 
2448 static struct recv_wid *
2449 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2450 {
2451 	struct recv_wid	*rwid;
2452 
2453 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2454 	rwid->xid = msgid;
2455 	rwid->addr = sgl->ds_va;
2456 	rwid->qp = qp;
2457 
2458 	return (rwid);
2459 }
2460 
2461 static void
2462 rib_free_wid(struct recv_wid *rwid)
2463 {
2464 	kmem_free(rwid, sizeof (struct recv_wid));
2465 }
2466 
2467 rdma_stat
2468 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2469 {
2470 	rib_qp_t	*qp = ctoqp(conn);
2471 	struct clist	*clp = cl;
2472 	struct reply	*rep;
2473 	struct recv_wid	*rwid;
2474 	int		nds;
2475 	ibt_wr_ds_t	sgl[DSEG_MAX];
2476 	ibt_recv_wr_t	recv_wr;
2477 	rdma_stat	ret;
2478 	ibt_status_t	ibt_status;
2479 
2480 	/*
2481 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2482 	 */
2483 
2484 	nds = 0;
2485 	while (cl != NULL) {
2486 		if (nds >= DSEG_MAX) {
2487 			ret = RDMA_FAILED;
2488 			goto done;
2489 		}
2490 		sgl[nds].ds_va = cl->w.c_saddr;
2491 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2492 		sgl[nds].ds_len = cl->c_len;
2493 		cl = cl->c_next;
2494 		nds++;
2495 	}
2496 
2497 	if (nds != 1) {
2498 		ret = RDMA_FAILED;
2499 		goto done;
2500 	}
2501 
2502 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2503 	recv_wr.wr_nds = nds;
2504 	recv_wr.wr_sgl = sgl;
2505 
2506 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2507 	if (rwid) {
2508 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2509 	} else {
2510 		ret = RDMA_NORESOURCE;
2511 		goto done;
2512 	}
2513 	rep = rib_addreplylist(qp, msgid);
2514 	if (!rep) {
2515 		rib_free_wid(rwid);
2516 		ret = RDMA_NORESOURCE;
2517 		goto done;
2518 	}
2519 
2520 	mutex_enter(&conn->c_lock);
2521 
2522 	if (conn->c_state == C_CONNECTED) {
2523 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2524 	}
2525 
2526 	if (conn->c_state != C_CONNECTED ||
2527 	    ibt_status != IBT_SUCCESS) {
2528 		if (conn->c_state != C_DISCONN_PEND)
2529 			conn->c_state = C_ERROR_CONN;
2530 		mutex_exit(&conn->c_lock);
2531 		rib_free_wid(rwid);
2532 		(void) rib_rem_rep(qp, rep);
2533 		ret = RDMA_CONNLOST;
2534 		goto done;
2535 	}
2536 
2537 	mutex_enter(&qp->posted_rbufs_lock);
2538 	qp->n_posted_rbufs++;
2539 	mutex_exit(&qp->posted_rbufs_lock);
2540 
2541 	mutex_exit(&conn->c_lock);
2542 	return (RDMA_SUCCESS);
2543 
2544 done:
2545 	while (clp != NULL) {
2546 		rib_rbuf_free(conn, RECV_BUFFER,
2547 		    (void *)(uintptr_t)clp->w.c_saddr3);
2548 		clp = clp->c_next;
2549 	}
2550 	return (ret);
2551 }
2552 
2553 rdma_stat
2554 rib_svc_post(CONN* conn, struct clist *cl)
2555 {
2556 	rib_qp_t	*qp = ctoqp(conn);
2557 	struct svc_recv	*s_recvp;
2558 	int		nds;
2559 	ibt_wr_ds_t	sgl[DSEG_MAX];
2560 	ibt_recv_wr_t	recv_wr;
2561 	ibt_status_t	ibt_status;
2562 
2563 	nds = 0;
2564 	while (cl != NULL) {
2565 		if (nds >= DSEG_MAX) {
2566 			return (RDMA_FAILED);
2567 		}
2568 		sgl[nds].ds_va = cl->w.c_saddr;
2569 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2570 		sgl[nds].ds_len = cl->c_len;
2571 		cl = cl->c_next;
2572 		nds++;
2573 	}
2574 
2575 	if (nds != 1) {
2576 		rib_rbuf_free(conn, RECV_BUFFER,
2577 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2578 
2579 		return (RDMA_FAILED);
2580 	}
2581 
2582 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2583 	recv_wr.wr_nds = nds;
2584 	recv_wr.wr_sgl = sgl;
2585 
2586 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2587 	/* Use s_recvp's addr as wr id */
2588 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2589 	mutex_enter(&conn->c_lock);
2590 	if (conn->c_state == C_CONNECTED) {
2591 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2592 	}
2593 	if (conn->c_state != C_CONNECTED ||
2594 	    ibt_status != IBT_SUCCESS) {
2595 		if (conn->c_state != C_DISCONN_PEND)
2596 			conn->c_state = C_ERROR_CONN;
2597 		mutex_exit(&conn->c_lock);
2598 		rib_rbuf_free(conn, RECV_BUFFER,
2599 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2600 		(void) rib_free_svc_recv(s_recvp);
2601 
2602 		return (RDMA_CONNLOST);
2603 	}
2604 	mutex_exit(&conn->c_lock);
2605 
2606 	return (RDMA_SUCCESS);
2607 }
2608 
2609 /* Client */
2610 rdma_stat
2611 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2612 {
2613 	return (rib_clnt_post(conn, cl, msgid));
2614 }
2615 
2616 /* Client */
2617 rdma_stat
2618 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2619 {
2620 	rib_qp_t	*qp = ctoqp(conn);
2621 	struct reply	*rep;
2622 
2623 	mutex_enter(&qp->replylist_lock);
2624 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2625 		if (rep->xid == msgid) {
2626 			if (rep->vaddr_cq) {
2627 				rib_rbuf_free(conn, RECV_BUFFER,
2628 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2629 			}
2630 			(void) rib_remreply(qp, rep);
2631 			break;
2632 		}
2633 	}
2634 	mutex_exit(&qp->replylist_lock);
2635 
2636 	return (RDMA_SUCCESS);
2637 }
2638 
2639 /* Server */
2640 rdma_stat
2641 rib_post_recv(CONN *conn, struct clist *cl)
2642 {
2643 	rib_qp_t	*qp = ctoqp(conn);
2644 
2645 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2646 		mutex_enter(&qp->posted_rbufs_lock);
2647 		qp->n_posted_rbufs++;
2648 		mutex_exit(&qp->posted_rbufs_lock);
2649 		return (RDMA_SUCCESS);
2650 	}
2651 	return (RDMA_FAILED);
2652 }
2653 
2654 /*
2655  * Client side only interface to "recv" the rpc reply buf
2656  * posted earlier by rib_post_resp(conn, cl, msgid).
2657  */
2658 rdma_stat
2659 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2660 {
2661 	struct reply *rep = NULL;
2662 	clock_t timout, cv_wait_ret;
2663 	rdma_stat ret = RDMA_SUCCESS;
2664 	rib_qp_t *qp = ctoqp(conn);
2665 
2666 	/*
2667 	 * Find the reply structure for this msgid
2668 	 */
2669 	mutex_enter(&qp->replylist_lock);
2670 
2671 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2672 		if (rep->xid == msgid)
2673 			break;
2674 	}
2675 
2676 	if (rep != NULL) {
2677 		/*
2678 		 * If message not yet received, wait.
2679 		 */
2680 		if (rep->status == (uint_t)REPLY_WAIT) {
2681 			timout = ddi_get_lbolt() +
2682 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2683 
2684 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2685 			    &qp->replylist_lock, timout)) > 0 &&
2686 			    rep->status == (uint_t)REPLY_WAIT)
2687 				;
2688 
2689 			switch (cv_wait_ret) {
2690 			case -1:	/* timeout */
2691 				ret = RDMA_TIMEDOUT;
2692 				break;
2693 			case 0:
2694 				ret = RDMA_INTR;
2695 				break;
2696 			default:
2697 				break;
2698 			}
2699 		}
2700 
2701 		if (rep->status == RDMA_SUCCESS) {
2702 			struct clist *cl = NULL;
2703 
2704 			/*
2705 			 * Got message successfully
2706 			 */
2707 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2708 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2709 			*clp = cl;
2710 		} else {
2711 			if (rep->status != (uint_t)REPLY_WAIT) {
2712 				/*
2713 				 * Got error in reply message. Free
2714 				 * recv buffer here.
2715 				 */
2716 				ret = rep->status;
2717 				rib_rbuf_free(conn, RECV_BUFFER,
2718 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2719 			}
2720 		}
2721 		(void) rib_remreply(qp, rep);
2722 	} else {
2723 		/*
2724 		 * No matching reply structure found for given msgid on the
2725 		 * reply wait list.
2726 		 */
2727 		ret = RDMA_INVAL;
2728 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2729 	}
2730 
2731 	/*
2732 	 * Done.
2733 	 */
2734 	mutex_exit(&qp->replylist_lock);
2735 	return (ret);
2736 }
2737 
2738 /*
2739  * RDMA write a buffer to the remote address.
2740  */
2741 rdma_stat
2742 rib_write(CONN *conn, struct clist *cl, int wait)
2743 {
2744 	ibt_send_wr_t	tx_wr;
2745 	int		cv_sig;
2746 	ibt_wr_ds_t	sgl[DSEG_MAX];
2747 	struct send_wid	*wdesc;
2748 	ibt_status_t	ibt_status;
2749 	rdma_stat	ret = RDMA_SUCCESS;
2750 	rib_qp_t	*qp = ctoqp(conn);
2751 	uint64_t	n_writes = 0;
2752 
2753 	if (cl == NULL) {
2754 		return (RDMA_FAILED);
2755 	}
2756 
2757 	while ((cl != NULL)) {
2758 		if (cl->c_len > 0) {
2759 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2760 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2761 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2762 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2763 			sgl[0].ds_va = cl->w.c_saddr;
2764 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2765 			sgl[0].ds_len = cl->c_len;
2766 
2767 			if (wait) {
2768 				cv_sig = 1;
2769 			} else {
2770 				if (n_writes > max_unsignaled_rws) {
2771 					n_writes = 0;
2772 					cv_sig = 1;
2773 				} else {
2774 					cv_sig = 0;
2775 				}
2776 			}
2777 
2778 			if (cv_sig) {
2779 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2780 				wdesc = rib_init_sendwait(0, cv_sig, qp);
2781 				tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2782 				mutex_enter(&wdesc->sendwait_lock);
2783 			} else {
2784 				tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2785 				tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2786 			}
2787 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2788 			tx_wr.wr_trans = IBT_RC_SRV;
2789 			tx_wr.wr_nds = 1;
2790 			tx_wr.wr_sgl = sgl;
2791 
2792 			mutex_enter(&conn->c_lock);
2793 			if (conn->c_state == C_CONNECTED) {
2794 				ibt_status =
2795 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2796 			}
2797 			if (conn->c_state != C_CONNECTED ||
2798 			    ibt_status != IBT_SUCCESS) {
2799 				if (conn->c_state != C_DISCONN_PEND)
2800 					conn->c_state = C_ERROR_CONN;
2801 				mutex_exit(&conn->c_lock);
2802 				if (cv_sig) {
2803 					mutex_exit(&wdesc->sendwait_lock);
2804 					(void) rib_free_sendwait(wdesc);
2805 				}
2806 				return (RDMA_CONNLOST);
2807 			}
2808 
2809 			mutex_exit(&conn->c_lock);
2810 
2811 			/*
2812 			 * Wait for send to complete
2813 			 */
2814 			if (cv_sig) {
2815 
2816 				rib_send_hold(qp);
2817 				mutex_exit(&wdesc->sendwait_lock);
2818 
2819 				ret = rib_sendwait(qp, wdesc);
2820 				if (ret != 0)
2821 					return (ret);
2822 			}
2823 			n_writes ++;
2824 		}
2825 		cl = cl->c_next;
2826 	}
2827 	return (RDMA_SUCCESS);
2828 }
2829 
2830 /*
2831  * RDMA Read a buffer from the remote address.
2832  */
2833 rdma_stat
2834 rib_read(CONN *conn, struct clist *cl, int wait)
2835 {
2836 	ibt_send_wr_t	rx_wr;
2837 	int		cv_sig = 0;
2838 	ibt_wr_ds_t	sgl;
2839 	struct send_wid	*wdesc;
2840 	ibt_status_t	ibt_status = IBT_SUCCESS;
2841 	rdma_stat	ret = RDMA_SUCCESS;
2842 	rib_qp_t	*qp = ctoqp(conn);
2843 
2844 	if (cl == NULL) {
2845 		return (RDMA_FAILED);
2846 	}
2847 
2848 	while (cl != NULL) {
2849 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2850 		/*
2851 		 * Remote address is at the head chunk item in list.
2852 		 */
2853 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2854 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2855 
2856 		sgl.ds_va = cl->u.c_daddr;
2857 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2858 		sgl.ds_len = cl->c_len;
2859 
2860 		/*
2861 		 * If there are multiple chunks to be read, and
2862 		 * wait is set, ask for signal only for the last chunk
2863 		 * and wait only on the last chunk. The completion of
2864 		 * RDMA_READ on last chunk ensures that reads on all
2865 		 * previous chunks are also completed.
2866 		 */
2867 		if (wait && (cl->c_next == NULL)) {
2868 			cv_sig = 1;
2869 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2870 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2871 			rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2872 			mutex_enter(&wdesc->sendwait_lock);
2873 		} else {
2874 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2875 			rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2876 		}
2877 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2878 		rx_wr.wr_trans = IBT_RC_SRV;
2879 		rx_wr.wr_nds = 1;
2880 		rx_wr.wr_sgl = &sgl;
2881 
2882 		mutex_enter(&conn->c_lock);
2883 		if (conn->c_state == C_CONNECTED) {
2884 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2885 		}
2886 		if (conn->c_state != C_CONNECTED ||
2887 		    ibt_status != IBT_SUCCESS) {
2888 			if (conn->c_state != C_DISCONN_PEND)
2889 				conn->c_state = C_ERROR_CONN;
2890 			mutex_exit(&conn->c_lock);
2891 			if (wait && (cl->c_next == NULL)) {
2892 				mutex_exit(&wdesc->sendwait_lock);
2893 				(void) rib_free_sendwait(wdesc);
2894 			}
2895 			return (RDMA_CONNLOST);
2896 		}
2897 
2898 		mutex_exit(&conn->c_lock);
2899 
2900 		/*
2901 		 * Wait for send to complete if this is the
2902 		 * last item in the list.
2903 		 */
2904 		if (wait && cl->c_next == NULL) {
2905 			rib_send_hold(qp);
2906 			mutex_exit(&wdesc->sendwait_lock);
2907 
2908 			ret = rib_sendwait(qp, wdesc);
2909 
2910 			if (ret != 0)
2911 				return (ret);
2912 		}
2913 		cl = cl->c_next;
2914 	}
2915 	return (RDMA_SUCCESS);
2916 }
2917 
/*
 * rib_srv_cm_handler()
 *    Connection Manager callback to handle RC connection requests.
 *
 *    any	the rib_hca_t the event belongs to (must not be NULL)
 *    event	the CM event being delivered (must not be NULL)
 *    ret_args	filled in with the channel and RDMA parameters when a
 *		connection request is accepted
 *    priv_data, len	unused
 *
 *    Events handled:
 *	IBT_CM_EVENT_REQ_RCV	 create a server channel, pre-post
 *				 receive buffers, add the connection to
 *				 the HCA's server list and record the
 *				 remote/local IP addresses from the
 *				 request's private data.
 *	IBT_CM_EVENT_CONN_CLOSED for remotely initiated closes, move the
 *				 connection to an error state and free it
 *				 if it is unreferenced.
 *	IBT_CM_EVENT_CONN_EST	 optional debug message only.
 *	anything else		 optional debug message, then
 *				 IBT_CM_DEFAULT is returned.
 *
 *    Returns IBT_CM_REJECT when a connection request cannot be
 *    honoured, IBT_CM_DEFAULT for unhandled event types and
 *    IBT_CM_ACCEPT otherwise.
 */
/* ARGSUSED */
static ibt_cm_status_t
rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
	ibt_cm_return_args_t *ret_args, void *priv_data,
	ibt_priv_data_len_t len)
{
	queue_t		*q;
	rib_qp_t	*qp;
	rib_hca_t	*hca;
	rdma_stat	status = RDMA_SUCCESS;
	int		i;
	struct clist	cl;
	rdma_buf_t	rdbuf = {0};
	void		*buf = NULL;
	CONN		*conn;
	ibt_ip_cm_info_t	ipinfo;
	struct sockaddr_in *s;
	struct sockaddr_in6 *s6;
	int sin_size = sizeof (struct sockaddr_in);
	int in_size = sizeof (struct in_addr);
	int sin6_size = sizeof (struct sockaddr_in6);

	ASSERT(any != NULL);
	ASSERT(event != NULL);

	hca = (rib_hca_t *)any;

	/* got a connection request */
	switch (event->cm_type) {
	case IBT_CM_EVENT_REQ_RCV:
		/*
		 * If the plugin is in the NO_ACCEPT state, bail out.
		 */
		mutex_enter(&plugin_state_lock);
		if (plugin_state == NO_ACCEPT) {
			mutex_exit(&plugin_state_lock);
			return (IBT_CM_REJECT);
		}
		mutex_exit(&plugin_state_lock);

		/*
		 * Need to send a MRA MAD to CM so that it does not
		 * timeout on us.
		 */
		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
		    event->cm_event.req.req_timeout * 8, NULL, 0);

		/* Snapshot the queue pointer under open_hca_lock. */
		mutex_enter(&rib_stat->open_hca_lock);
		q = rib_stat->q;
		mutex_exit(&rib_stat->open_hca_lock);

		status = rib_svc_create_chan(hca, (caddr_t)q,
		    event->cm_event.req.req_prim_hca_port, &qp);

		if (status) {
			return (IBT_CM_REJECT);
		}

		/* Reply parameters handed back to the CM. */
		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;

		/*
		 * Pre-posts RECV buffers
		 */
		conn = qptoc(qp);
		for (i = 0; i < preposted_rbufs; i++) {
			bzero(&rdbuf, sizeof (rdbuf));
			rdbuf.type = RECV_BUFFER;
			buf = rib_rbuf_alloc(conn, &rdbuf);
			if (buf == NULL) {
				/*
				 * A connection is not established yet.
				 * Just flush the channel. Buffers
				 * posted till now will error out with
				 * IBT_WC_WR_FLUSHED_ERR.
				 */
				(void) ibt_flush_channel(qp->qp_hdl);
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}

			/* Describe the buffer as a one-element chunk list. */
			bzero(&cl, sizeof (cl));
			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
			cl.c_len = rdbuf.len;
			cl.c_smemhandle.mrc_lmr =
			    rdbuf.handle.mrc_lmr; /* lkey */
			cl.c_next = NULL;
			status = rib_post_recv(conn, &cl);
			if (status != RDMA_SUCCESS) {
				/*
				 * A connection is not established yet.
				 * Just flush the channel. Buffers
				 * posted till now will error out with
				 * IBT_WC_WR_FLUSHED_ERR.
				 */
				(void) ibt_flush_channel(qp->qp_hdl);
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}
		}
		(void) rib_add_connlist(conn, &hca->srv_conn_list);

		/*
		 * NOTE(review): every IBT_CM_REJECT return from here to
		 * the end of this case leaves conn on srv_conn_list with
		 * its channel and pre-posted buffers in place; no cleanup
		 * is done in this function. Confirm that it is reclaimed
		 * elsewhere.
		 */

		/*
		 * Get the address translation
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_exit(&hca->state_lock);
			return (IBT_CM_REJECT);
		}
		rw_exit(&hca->state_lock);

		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));

		if (ibt_get_ip_data(event->cm_priv_data_len,
		    event->cm_priv_data,
		    &ipinfo) != IBT_SUCCESS) {

			return (IBT_CM_REJECT);
		}

		/*
		 * Record netid, remote (source) and local (destination)
		 * addresses on the conn according to the address family.
		 */
		switch (ipinfo.src_addr.family) {
		case AF_INET:

			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
			    KM_SLEEP);
			(void) strcpy(conn->c_netid, RIBNETID_TCP);

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin_size;
			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);

			s = (struct sockaddr_in *)conn->c_raddr.buf;
			s->sin_family = AF_INET;
			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
			    &s->sin_addr, in_size);

			conn->c_laddr.maxlen =
			    conn->c_laddr.len = sin_size;
			conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);

			s = (struct sockaddr_in *)conn->c_laddr.buf;
			s->sin_family = AF_INET;
			bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
			    &s->sin_addr, in_size);

			break;

		case AF_INET6:

			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
			    KM_SLEEP);
			(void) strcpy(conn->c_netid, RIBNETID_TCP6);

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin6_size;
			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);

			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
			s6->sin6_family = AF_INET6;
			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
			    &s6->sin6_addr,
			    sizeof (struct in6_addr));

			conn->c_laddr.maxlen =
			    conn->c_laddr.len = sin6_size;
			conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);

			s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
			s6->sin6_family = AF_INET6;
			bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
			    &s6->sin6_addr,
			    sizeof (struct in6_addr));

			break;

		default:
			/* Unsupported address family. */
			return (IBT_CM_REJECT);
		}

		break;

	case IBT_CM_EVENT_CONN_CLOSED:
	{
		/*
		 * These declarations shadow the function-level conn and
		 * qp; here they are derived from the event's channel.
		 */
		CONN		*conn;
		rib_qp_t	*qp;

		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				/* A disconnect is already pending. */
				mutex_exit(&conn->c_lock);
				break;
			}
			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the conn if c_ref goes down to 0
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				(void) rib_disconnect_channel(conn,
				    &hca->srv_conn_list);
			} else {
				/*
				 * conn will be freed when c_ref goes to 0.
				 * Indicate to cleaning thread not to close
				 * the connection, but just free the channel.
				 */
				conn->c_flags |= C_CLOSE_NOTNEEDED;
				mutex_exit(&conn->c_lock);
			}
			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
			break;
		}
		break;
	}
	case IBT_CM_EVENT_CONN_EST:
		/*
		 * RTU received, hence connection established.
		 */
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "(CONN_EST) channel established");
		break;

	default:
		if (rib_debug > 2) {
			/* Let CM handle the following events. */
			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
			}
		}
		return (IBT_CM_DEFAULT);
	}

	/* accept all other CM messages (i.e. let the CM handle them) */
	return (IBT_CM_ACCEPT);
}
3200 
/*
 * Register the given service type with the IB CM (if it is not already
 * on rib_stat->service_list) and bind it on the active ports of 'hca'.
 *
 * hca		- HCA whose ports are queried and bound.
 * service_type	- key used to look the service up in rib_stat->service_list.
 * protocol_num,
 * dst_port	- passed to ibt_get_ip_sid() to build the service id that
 *		  is handed to ibt_register_service().
 *
 * Returns RDMA_FAILED if the HCA is detached, the port query or service
 * registration fails, or no binding exists afterwards.  Otherwise sets
 * plugin_state to ACCEPT and returns RDMA_SUCCESS.
 *
 * Locking: hca->state_lock is held as reader only around the port query.
 * rib_stat->service_list_lock is held as writer across the service lookup
 * and all binds; hca->bound_services_lock is taken inside it.
 */
static rdma_stat
rib_register_service(rib_hca_t *hca, int service_type,
	uint8_t protocol_num, in_port_t dst_port)
{
	ibt_srv_desc_t		sdesc;
	ibt_hca_portinfo_t	*port_infop;
	ib_svc_id_t		srv_id;
	ibt_srv_hdl_t		srv_hdl;
	uint_t			port_size;
	uint_t			pki, i, num_ports, nbinds;
	ibt_status_t		ibt_status;
	rib_service_t		*service;
	ib_pkey_t		pkey;

	/*
	 * Query all ports for the given HCA, unless it is detached.
	 */
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
		    &num_ports, &port_size);
		rw_exit(&hca->state_lock);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}
	if (ibt_status != IBT_SUCCESS) {
		return (RDMA_FAILED);
	}

	DTRACE_PROBE1(rpcib__i__regservice_numports,
	    int, num_ports);

	/*
	 * Observability only: fire a probe with the 1-based port number
	 * for every port, distinguishing active from inactive ones.
	 */
	for (i = 0; i < num_ports; i++) {
		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
			    int, i+1);
		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
			DTRACE_PROBE1(rpcib__i__regservice__portactive,
			    int, i+1);
		}
	}

	/*
	 * The service list lock is held as writer from here until all
	 * binds below have been attempted, so the lookup, the possible
	 * insertion of a new service entry and the binds are serialized
	 * against other holders of this lock.
	 */
	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
	/*
	 * Start registering and binding the service on the active
	 * ports of this HCA.  First look for an existing entry of
	 * this service type.
	 */
	nbinds = 0;
	for (service = rib_stat->service_list;
	    service && (service->srv_type != service_type);
	    service = service->next)
		;

	if (service == NULL) {
		/*
		 * No entry for this service type yet: register it with
		 * the CM to obtain a svc_id and svc_hdl.  The service id
		 * is derived from protocol_num and dst_port.
		 *
		 * NOTE(review): IBT_CM_SERVICE_EXISTS is tolerated below;
		 * srv_hdl and srv_id were zeroed here and it is not
		 * visible in this function whether they are filled in
		 * for that case -- confirm.
		 */
		(void) bzero(&srv_id, sizeof (ib_svc_id_t));
		(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
		(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
		sdesc.sd_handler = rib_srv_cm_handler;
		sdesc.sd_flags = 0;
		ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
		    &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
		    1, &srv_hdl, &srv_id);
		if ((ibt_status != IBT_SUCCESS) &&
		    (ibt_status != IBT_CM_SERVICE_EXISTS)) {
			rw_exit(&rib_stat->service_list_lock);
			DTRACE_PROBE1(rpcib__i__regservice__ibtres,
			    int, ibt_status);
			ibt_free_portinfo(port_infop, port_size);
			return (RDMA_FAILED);
		}

		/*
		 * Allocate and prepare a service entry and link it at
		 * the head of the global service list.
		 */
		service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);

		service->srv_type = service_type;
		service->srv_hdl = srv_hdl;
		service->srv_id = srv_id;

		service->next = rib_stat->service_list;
		rib_stat->service_list = service;
		DTRACE_PROBE1(rpcib__i__regservice__new__service,
		    int, service->srv_type);
	} else {
		/* Reuse the handle and id of the existing entry. */
		srv_hdl = service->srv_hdl;
		srv_id = service->srv_id;
		DTRACE_PROBE1(rpcib__i__regservice__existing__service,
		    int, service->srv_type);
	}

	for (i = 0; i < num_ports; i++) {
		ibt_sbind_hdl_t		sbp;
		rib_hca_service_t	*hca_srv;
		ib_gid_t		gid;

		/* Only active ports are bound. */
		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
			continue;

		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
			pkey = port_infop[i].p_pkey_tbl[pki];

			/*
			 * Look for an existing binding of this service id
			 * on this port's first source GID.
			 */
			rw_enter(&hca->bound_services_lock, RW_READER);
			gid = port_infop[i].p_sgid_tbl[0];
			for (hca_srv = hca->bound_services; hca_srv;
			    hca_srv = hca_srv->next) {
				if ((hca_srv->srv_id == service->srv_id) &&
				    (hca_srv->gid.gid_prefix ==
				    gid.gid_prefix) &&
				    (hca_srv->gid.gid_guid == gid.gid_guid))
					break;
			}
			rw_exit(&hca->bound_services_lock);
			if (hca_srv != NULL) {
				/*
				 * port is already bound to the service;
				 * it still counts as a binding.
				 */
				DTRACE_PROBE1(
				    rpcib__i__regservice__already__bound,
				    int, i+1);
				nbinds++;
				continue;
			}

			/*
			 * Bind only for pkeys that have the IBSRM_HB bit
			 * set and are not IB_PKEY_INVALID_FULL.
			 */
			if ((pkey & IBSRM_HB) &&
			    (pkey != IB_PKEY_INVALID_FULL)) {

				sbp = NULL;
				ibt_status = ibt_bind_service(srv_hdl,
				    gid, NULL, hca, &sbp);

				if (ibt_status == IBT_SUCCESS) {
					/*
					 * Remember the binding on the HCA
					 * so it can be found (and later
					 * unbound) by srv_id and gid.
					 */
					hca_srv = kmem_zalloc(
					    sizeof (rib_hca_service_t),
					    KM_SLEEP);
					hca_srv->srv_id = srv_id;
					hca_srv->gid = gid;
					hca_srv->sbind_hdl = sbp;

					rw_enter(&hca->bound_services_lock,
					    RW_WRITER);
					hca_srv->next = hca->bound_services;
					hca->bound_services = hca_srv;
					rw_exit(&hca->bound_services_lock);
					nbinds++;
				}

				DTRACE_PROBE1(rpcib__i__regservice__bindres,
				    int, ibt_status);
			}
		}
	}
	rw_exit(&rib_stat->service_list_lock);

	ibt_free_portinfo(port_infop, port_size);

	if (nbinds == 0) {
		return (RDMA_FAILED);
	} else {
		/*
		 * Put this plugin into accept state, since at least
		 * one binding exists.
		 */
		mutex_enter(&plugin_state_lock);
		plugin_state = ACCEPT;
		mutex_exit(&plugin_state_lock);
		return (RDMA_SUCCESS);
	}
}
3384 
3385 void
3386 rib_listen(struct rdma_svc_data *rd)
3387 {
3388 	rdma_stat status;
3389 	int n_listening = 0;
3390 	rib_hca_t *hca;
3391 
3392 	mutex_enter(&rib_stat->listen_lock);
3393 	/*
3394 	 * if rd parameter is NULL then it means that rib_stat->q is
3395 	 * already initialized by a call from RDMA and we just want to
3396 	 * add a newly attached HCA to the same listening state as other
3397 	 * HCAs.
3398 	 */
3399 	if (rd == NULL) {
3400 		if (rib_stat->q == NULL) {
3401 			mutex_exit(&rib_stat->listen_lock);
3402 			return;
3403 		}
3404 	} else {
3405 		rib_stat->q = &rd->q;
3406 	}
3407 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3408 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3409 		/*
3410 		 * First check if a hca is still attached
3411 		 */
3412 		rw_enter(&hca->state_lock, RW_READER);
3413 		if (hca->state != HCA_INITED) {
3414 			rw_exit(&hca->state_lock);
3415 			continue;
3416 		}
3417 		rw_exit(&hca->state_lock);
3418 
3419 		/*
3420 		 * Right now the only service type is NFS. Hence
3421 		 * force feed this value. Ideally to communicate
3422 		 * the service type it should be passed down in
3423 		 * rdma_svc_data.
3424 		 */
3425 		status = rib_register_service(hca, NFS,
3426 		    IPPROTO_TCP, nfs_rdma_port);
3427 		if (status == RDMA_SUCCESS)
3428 			n_listening++;
3429 	}
3430 	rw_exit(&rib_stat->hcas_list_lock);
3431 
3432 	/*
3433 	 * Service active on an HCA, check rd->err_code for more
3434 	 * explainable errors.
3435 	 */
3436 	if (rd) {
3437 		if (n_listening > 0) {
3438 			rd->active = 1;
3439 			rd->err_code = RDMA_SUCCESS;
3440 		} else {
3441 			rd->active = 0;
3442 			rd->err_code = RDMA_FAILED;
3443 		}
3444 	}
3445 	mutex_exit(&rib_stat->listen_lock);
3446 }
3447 
3448 /* XXXX */
3449 /* ARGSUSED */
3450 static void
3451 rib_listen_stop(struct rdma_svc_data *svcdata)
3452 {
3453 	rib_hca_t		*hca;
3454 
3455 	mutex_enter(&rib_stat->listen_lock);
3456 	/*
3457 	 * KRPC called the RDMATF to stop the listeners, this means
3458 	 * stop sending incomming or recieved requests to KRPC master
3459 	 * transport handle for RDMA-IB. This is also means that the
3460 	 * master transport handle, responsible for us, is going away.
3461 	 */
3462 	mutex_enter(&plugin_state_lock);
3463 	plugin_state = NO_ACCEPT;
3464 	if (svcdata != NULL)
3465 		svcdata->active = 0;
3466 	mutex_exit(&plugin_state_lock);
3467 
3468 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3469 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3470 		/*
3471 		 * First check if a hca is still attached
3472 		 */
3473 		rw_enter(&hca->state_lock, RW_READER);
3474 		if (hca->state == HCA_DETACHED) {
3475 			rw_exit(&hca->state_lock);
3476 			continue;
3477 		}
3478 		rib_close_channels(&hca->srv_conn_list);
3479 		rib_stop_services(hca);
3480 		rw_exit(&hca->state_lock);
3481 	}
3482 	rw_exit(&rib_stat->hcas_list_lock);
3483 
3484 	/*
3485 	 * Avoid rib_listen() using the stale q field.
3486 	 * This could happen if a port goes up after all services
3487 	 * are already unregistered.
3488 	 */
3489 	rib_stat->q = NULL;
3490 	mutex_exit(&rib_stat->listen_lock);
3491 }
3492 
3493 /*
3494  * Traverse the HCA's service list to unbind and deregister services.
3495  * For each bound service of HCA to be removed, first find the corresponding
3496  * service handle (srv_hdl) and then unbind the service by calling
3497  * ibt_unbind_service().
3498  */
3499 static void
3500 rib_stop_services(rib_hca_t *hca)
3501 {
3502 	rib_hca_service_t *srv_list, *to_remove;
3503 
3504 	/*
3505 	 * unbind and deregister the services for this service type.
3506 	 * Right now there is only one service type. In future it will
3507 	 * be passed down to this function.
3508 	 */
3509 	rw_enter(&hca->bound_services_lock, RW_READER);
3510 	srv_list = hca->bound_services;
3511 	hca->bound_services = NULL;
3512 	rw_exit(&hca->bound_services_lock);
3513 
3514 	while (srv_list != NULL) {
3515 		rib_service_t *sc;
3516 
3517 		to_remove = srv_list;
3518 		srv_list = to_remove->next;
3519 		rw_enter(&rib_stat->service_list_lock, RW_READER);
3520 		for (sc = rib_stat->service_list;
3521 		    sc && (sc->srv_id != to_remove->srv_id);
3522 		    sc = sc->next)
3523 			;
3524 		/*
3525 		 * if sc is NULL then the service doesn't exist anymore,
3526 		 * probably just removed completely through rib_stat.
3527 		 */
3528 		if (sc != NULL)
3529 			(void) ibt_unbind_service(sc->srv_hdl,
3530 			    to_remove->sbind_hdl);
3531 		rw_exit(&rib_stat->service_list_lock);
3532 		kmem_free(to_remove, sizeof (rib_hca_service_t));
3533 	}
3534 }
3535 
3536 static struct svc_recv *
3537 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3538 {
3539 	struct svc_recv	*recvp;
3540 
3541 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3542 	recvp->vaddr = sgl->ds_va;
3543 	recvp->qp = qp;
3544 	recvp->bytes_xfer = 0;
3545 	return (recvp);
3546 }
3547 
3548 static int
3549 rib_free_svc_recv(struct svc_recv *recvp)
3550 {
3551 	kmem_free(recvp, sizeof (*recvp));
3552 
3553 	return (0);
3554 }
3555 
3556 static struct reply *
3557 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3558 {
3559 	struct reply	*rep;
3560 
3561 
3562 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3563 	if (rep == NULL) {
3564 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3565 		return (NULL);
3566 	}
3567 	rep->xid = msgid;
3568 	rep->vaddr_cq = NULL;
3569 	rep->bytes_xfer = 0;
3570 	rep->status = (uint_t)REPLY_WAIT;
3571 	rep->prev = NULL;
3572 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3573 
3574 	mutex_enter(&qp->replylist_lock);
3575 	if (qp->replylist) {
3576 		rep->next = qp->replylist;
3577 		qp->replylist->prev = rep;
3578 	}
3579 	qp->rep_list_size++;
3580 
3581 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3582 	    int, qp->rep_list_size);
3583 
3584 	qp->replylist = rep;
3585 	mutex_exit(&qp->replylist_lock);
3586 
3587 	return (rep);
3588 }
3589 
3590 static rdma_stat
3591 rib_rem_replylist(rib_qp_t *qp)
3592 {
3593 	struct reply	*r, *n;
3594 
3595 	mutex_enter(&qp->replylist_lock);
3596 	for (r = qp->replylist; r != NULL; r = n) {
3597 		n = r->next;
3598 		(void) rib_remreply(qp, r);
3599 	}
3600 	mutex_exit(&qp->replylist_lock);
3601 
3602 	return (RDMA_SUCCESS);
3603 }
3604 
3605 static int
3606 rib_remreply(rib_qp_t *qp, struct reply *rep)
3607 {
3608 
3609 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3610 	if (rep->prev) {
3611 		rep->prev->next = rep->next;
3612 	}
3613 	if (rep->next) {
3614 		rep->next->prev = rep->prev;
3615 	}
3616 	if (qp->replylist == rep)
3617 		qp->replylist = rep->next;
3618 
3619 	cv_destroy(&rep->wait_cv);
3620 	qp->rep_list_size--;
3621 
3622 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3623 	    int, qp->rep_list_size);
3624 
3625 	kmem_free(rep, sizeof (*rep));
3626 
3627 	return (0);
3628 }
3629 
3630 rdma_stat
3631 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3632 	struct mrc *buf_handle)
3633 {
3634 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3635 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3636 	rdma_stat	status;
3637 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3638 
3639 	/*
3640 	 * Note: ALL buffer pools use the same memory type RDMARW.
3641 	 */
3642 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3643 	if (status == RDMA_SUCCESS) {
3644 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3645 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3646 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3647 	} else {
3648 		buf_handle->mrc_linfo = NULL;
3649 		buf_handle->mrc_lmr = 0;
3650 		buf_handle->mrc_rmr = 0;
3651 	}
3652 	return (status);
3653 }
3654 
3655 static rdma_stat
3656 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3657 	ibt_mr_flags_t spec,
3658 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3659 {
3660 	ibt_mr_attr_t	mem_attr;
3661 	ibt_status_t	ibt_status;
3662 	mem_attr.mr_vaddr = (uintptr_t)buf;
3663 	mem_attr.mr_len = (ib_msglen_t)size;
3664 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3665 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3666 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3667 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3668 
3669 	rw_enter(&hca->state_lock, RW_READER);
3670 	if (hca->state != HCA_DETACHED) {
3671 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3672 		    &mem_attr, mr_hdlp, mr_descp);
3673 		rw_exit(&hca->state_lock);
3674 	} else {
3675 		rw_exit(&hca->state_lock);
3676 		return (RDMA_FAILED);
3677 	}
3678 
3679 	if (ibt_status != IBT_SUCCESS) {
3680 		return (RDMA_FAILED);
3681 	}
3682 	return (RDMA_SUCCESS);
3683 }
3684 
3685 rdma_stat
3686 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3687 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3688 {
3689 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3690 	rib_lrc_entry_t *l;
3691 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3692 	rdma_stat	status;
3693 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3694 
3695 	/*
3696 	 * Non-coherent memory registration.
3697 	 */
3698 	l = (rib_lrc_entry_t *)lrc;
3699 	if (l) {
3700 		if (l->registered) {
3701 			buf_handle->mrc_linfo =
3702 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3703 			buf_handle->mrc_lmr =
3704 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3705 			buf_handle->mrc_rmr =
3706 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3707 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3708 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3709 			return (RDMA_SUCCESS);
3710 		} else {
3711 			/* Always register the whole buffer */
3712 			buf = (caddr_t)l->lrc_buf;
3713 			buflen = l->lrc_len;
3714 		}
3715 	}
3716 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3717 
3718 	if (status == RDMA_SUCCESS) {
3719 		if (l) {
3720 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3721 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3722 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3723 			l->registered		 = TRUE;
3724 		}
3725 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3726 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3727 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3728 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3729 	} else {
3730 		buf_handle->mrc_linfo = NULL;
3731 		buf_handle->mrc_lmr = 0;
3732 		buf_handle->mrc_rmr = 0;
3733 	}
3734 	return (status);
3735 }
3736 
3737 /* ARGSUSED */
3738 rdma_stat
3739 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3740 {
3741 	rib_hca_t *hca = (ctoqp(conn))->hca;
3742 	/*
3743 	 * Allow memory deregistration even if HCA is
3744 	 * getting detached. Need all outstanding
3745 	 * memory registrations to be deregistered
3746 	 * before HCA_DETACH_EVENT can be accepted.
3747 	 */
3748 	(void) ibt_deregister_mr(hca->hca_hdl,
3749 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3750 	return (RDMA_SUCCESS);
3751 }
3752 
3753 /* ARGSUSED */
3754 rdma_stat
3755 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3756 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3757 {
3758 	rib_lrc_entry_t *l;
3759 	l = (rib_lrc_entry_t *)lrc;
3760 	if (l)
3761 		if (l->registered)
3762 			return (RDMA_SUCCESS);
3763 
3764 	(void) rib_deregistermem(conn, buf, buf_handle);
3765 
3766 	return (RDMA_SUCCESS);
3767 }
3768 
3769 /* ARGSUSED */
3770 rdma_stat
3771 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3772 		int len, int cpu)
3773 {
3774 	ibt_status_t	status;
3775 	rib_hca_t *hca = (ctoqp(conn))->hca;
3776 	ibt_mr_sync_t	mr_segment;
3777 
3778 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3779 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3780 	mr_segment.ms_len = (ib_memlen_t)len;
3781 	if (cpu) {
3782 		/* make incoming data visible to memory */
3783 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3784 	} else {
3785 		/* make memory changes visible to IO */
3786 		mr_segment.ms_flags = IBT_SYNC_READ;
3787 	}
3788 	rw_enter(&hca->state_lock, RW_READER);
3789 	if (hca->state != HCA_DETACHED) {
3790 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3791 		rw_exit(&hca->state_lock);
3792 	} else {
3793 		rw_exit(&hca->state_lock);
3794 		return (RDMA_FAILED);
3795 	}
3796 
3797 	if (status == IBT_SUCCESS)
3798 		return (RDMA_SUCCESS);
3799 	else {
3800 		return (RDMA_FAILED);
3801 	}
3802 }
3803 
3804 /*
3805  * XXXX	????
3806  */
3807 static rdma_stat
3808 rib_getinfo(rdma_info_t *info)
3809 {
3810 	/*
3811 	 * XXXX	Hack!
3812 	 */
3813 	info->addrlen = 16;
3814 	info->mts = 1000000;
3815 	info->mtu = 1000000;
3816 
3817 	return (RDMA_SUCCESS);
3818 }
3819 
3820 rib_bufpool_t *
3821 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3822 {
3823 	rib_bufpool_t	*rbp = NULL;
3824 	bufpool_t	*bp = NULL;
3825 	caddr_t		buf;
3826 	ibt_mr_attr_t	mem_attr;
3827 	ibt_status_t	ibt_status;
3828 	int		i, j;
3829 
3830 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3831 
3832 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3833 	    num * sizeof (void *), KM_SLEEP);
3834 
3835 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3836 	bp->numelems = num;
3837 
3838 
3839 	switch (ptype) {
3840 	case SEND_BUFFER:
3841 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3842 		bp->rsize = RPC_MSG_SZ;
3843 		break;
3844 	case RECV_BUFFER:
3845 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3846 		bp->rsize = RPC_BUF_SIZE;
3847 		break;
3848 	default:
3849 		goto fail;
3850 	}
3851 
3852 	/*
3853 	 * Register the pool.
3854 	 */
3855 	bp->bufsize = num * bp->rsize;
3856 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3857 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3858 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3859 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3860 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3861 	rw_enter(&hca->state_lock, RW_READER);
3862 
3863 	if (hca->state == HCA_DETACHED) {
3864 		rw_exit(&hca->state_lock);
3865 		goto fail;
3866 	}
3867 
3868 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3869 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3870 		mem_attr.mr_vaddr = (uintptr_t)buf;
3871 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3872 		mem_attr.mr_as = NULL;
3873 		ibt_status = ibt_register_mr(hca->hca_hdl,
3874 		    hca->pd_hdl, &mem_attr,
3875 		    &rbp->mr_hdl[i],
3876 		    &rbp->mr_desc[i]);
3877 		if (ibt_status != IBT_SUCCESS) {
3878 			for (j = 0; j < i; j++) {
3879 				(void) ibt_deregister_mr(hca->hca_hdl,
3880 				    rbp->mr_hdl[j]);
3881 			}
3882 			rw_exit(&hca->state_lock);
3883 			goto fail;
3884 		}
3885 	}
3886 	rw_exit(&hca->state_lock);
3887 	buf = (caddr_t)bp->buf;
3888 	for (i = 0; i < num; i++, buf += bp->rsize) {
3889 		bp->buflist[i] = (void *)buf;
3890 	}
3891 	bp->buffree = num - 1;	/* no. of free buffers */
3892 	rbp->bpool = bp;
3893 
3894 	return (rbp);
3895 fail:
3896 	if (bp) {
3897 		if (bp->buf)
3898 			kmem_free(bp->buf, bp->bufsize);
3899 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3900 	}
3901 	if (rbp) {
3902 		if (rbp->mr_hdl)
3903 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3904 		if (rbp->mr_desc)
3905 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3906 		kmem_free(rbp, sizeof (rib_bufpool_t));
3907 	}
3908 	return (NULL);
3909 }
3910 
3911 static void
3912 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3913 {
3914 	int i;
3915 	rib_bufpool_t *rbp = NULL;
3916 	bufpool_t *bp;
3917 
3918 	/*
3919 	 * Obtain pool address based on type of pool
3920 	 */
3921 	switch (ptype) {
3922 		case SEND_BUFFER:
3923 			rbp = hca->send_pool;
3924 			break;
3925 		case RECV_BUFFER:
3926 			rbp = hca->recv_pool;
3927 			break;
3928 		default:
3929 			return;
3930 	}
3931 	if (rbp == NULL)
3932 		return;
3933 
3934 	bp = rbp->bpool;
3935 
3936 	/*
3937 	 * Deregister the pool memory and free it.
3938 	 */
3939 	for (i = 0; i < bp->numelems; i++) {
3940 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3941 	}
3942 }
3943 
3944 static void
3945 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3946 {
3947 
3948 	rib_bufpool_t *rbp = NULL;
3949 	bufpool_t *bp;
3950 
3951 	/*
3952 	 * Obtain pool address based on type of pool
3953 	 */
3954 	switch (ptype) {
3955 		case SEND_BUFFER:
3956 			rbp = hca->send_pool;
3957 			break;
3958 		case RECV_BUFFER:
3959 			rbp = hca->recv_pool;
3960 			break;
3961 		default:
3962 			return;
3963 	}
3964 	if (rbp == NULL)
3965 		return;
3966 
3967 	bp = rbp->bpool;
3968 
3969 	/*
3970 	 * Free the pool memory.
3971 	 */
3972 	if (rbp->mr_hdl)
3973 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3974 
3975 	if (rbp->mr_desc)
3976 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3977 	if (bp->buf)
3978 		kmem_free(bp->buf, bp->bufsize);
3979 	mutex_destroy(&bp->buflock);
3980 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3981 	kmem_free(rbp, sizeof (rib_bufpool_t));
3982 }
3983 
3984 void
3985 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3986 {
3987 	/*
3988 	 * Deregister the pool memory and free it.
3989 	 */
3990 	rib_rbufpool_deregister(hca, ptype);
3991 	rib_rbufpool_free(hca, ptype);
3992 }
3993 
3994 /*
3995  * Fetch a buffer from the pool of type specified in rdbuf->type.
3996  */
3997 static rdma_stat
3998 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3999 {
4000 	rib_lrc_entry_t *rlep;
4001 
4002 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
4003 		rlep = rib_get_cache_buf(conn, rdbuf->len);
4004 		rdbuf->rb_private =  (caddr_t)rlep;
4005 		rdbuf->addr = rlep->lrc_buf;
4006 		rdbuf->handle = rlep->lrc_mhandle;
4007 		return (RDMA_SUCCESS);
4008 	}
4009 
4010 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4011 	if (rdbuf->addr) {
4012 		switch (rdbuf->type) {
4013 		case SEND_BUFFER:
4014 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
4015 			break;
4016 		case RECV_BUFFER:
4017 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
4018 			break;
4019 		default:
4020 			rdbuf->len = 0;
4021 		}
4022 		return (RDMA_SUCCESS);
4023 	} else
4024 		return (RDMA_FAILED);
4025 }
4026 
4027 /*
4028  * Fetch a buffer of specified type.
4029  * Note that rdbuf->handle is mw's rkey.
4030  */
4031 static void *
4032 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4033 {
4034 	rib_qp_t	*qp = ctoqp(conn);
4035 	rib_hca_t	*hca = qp->hca;
4036 	rdma_btype	ptype = rdbuf->type;
4037 	void		*buf;
4038 	rib_bufpool_t	*rbp = NULL;
4039 	bufpool_t	*bp;
4040 	int		i;
4041 
4042 	/*
4043 	 * Obtain pool address based on type of pool
4044 	 */
4045 	switch (ptype) {
4046 	case SEND_BUFFER:
4047 		rbp = hca->send_pool;
4048 		break;
4049 	case RECV_BUFFER:
4050 		rbp = hca->recv_pool;
4051 		break;
4052 	default:
4053 		return (NULL);
4054 	}
4055 	if (rbp == NULL)
4056 		return (NULL);
4057 
4058 	bp = rbp->bpool;
4059 
4060 	mutex_enter(&bp->buflock);
4061 	if (bp->buffree < 0) {
4062 		mutex_exit(&bp->buflock);
4063 		return (NULL);
4064 	}
4065 
4066 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4067 	buf = bp->buflist[bp->buffree];
4068 	rdbuf->addr = buf;
4069 	rdbuf->len = bp->rsize;
4070 	for (i = bp->numelems - 1; i >= 0; i--) {
4071 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4072 			rdbuf->handle.mrc_rmr =
4073 			    (uint32_t)rbp->mr_desc[i].md_rkey;
4074 			rdbuf->handle.mrc_linfo =
4075 			    (uintptr_t)rbp->mr_hdl[i];
4076 			rdbuf->handle.mrc_lmr =
4077 			    (uint32_t)rbp->mr_desc[i].md_lkey;
4078 			bp->buffree--;
4079 
4080 			mutex_exit(&bp->buflock);
4081 
4082 			return (buf);
4083 		}
4084 	}
4085 
4086 	mutex_exit(&bp->buflock);
4087 
4088 	return (NULL);
4089 }
4090 
4091 static void
4092 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4093 {
4094 
4095 	if (rdbuf->type == RDMA_LONG_BUFFER) {
4096 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4097 		rdbuf->rb_private = NULL;
4098 		return;
4099 	}
4100 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4101 }
4102 
4103 static void
4104 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4105 {
4106 	rib_qp_t *qp = ctoqp(conn);
4107 	rib_hca_t *hca = qp->hca;
4108 	rib_bufpool_t *rbp = NULL;
4109 	bufpool_t *bp;
4110 
4111 	/*
4112 	 * Obtain pool address based on type of pool
4113 	 */
4114 	switch (ptype) {
4115 	case SEND_BUFFER:
4116 		rbp = hca->send_pool;
4117 		break;
4118 	case RECV_BUFFER:
4119 		rbp = hca->recv_pool;
4120 		break;
4121 	default:
4122 		return;
4123 	}
4124 	if (rbp == NULL)
4125 		return;
4126 
4127 	bp = rbp->bpool;
4128 
4129 	mutex_enter(&bp->buflock);
4130 	if (++bp->buffree >= bp->numelems) {
4131 		/*
4132 		 * Should never happen
4133 		 */
4134 		bp->buffree--;
4135 	} else {
4136 		bp->buflist[bp->buffree] = buf;
4137 	}
4138 	mutex_exit(&bp->buflock);
4139 }
4140 
4141 static rdma_stat
4142 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4143 {
4144 	rw_enter(&connlist->conn_lock, RW_WRITER);
4145 	if (connlist->conn_hd) {
4146 		cn->c_next = connlist->conn_hd;
4147 		connlist->conn_hd->c_prev = cn;
4148 	}
4149 	connlist->conn_hd = cn;
4150 	rw_exit(&connlist->conn_lock);
4151 
4152 	return (RDMA_SUCCESS);
4153 }
4154 
4155 static rdma_stat
4156 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4157 {
4158 	rw_enter(&connlist->conn_lock, RW_WRITER);
4159 	if (cn->c_prev) {
4160 		cn->c_prev->c_next = cn->c_next;
4161 	}
4162 	if (cn->c_next) {
4163 		cn->c_next->c_prev = cn->c_prev;
4164 	}
4165 	if (connlist->conn_hd == cn)
4166 		connlist->conn_hd = cn->c_next;
4167 	rw_exit(&connlist->conn_lock);
4168 
4169 	return (RDMA_SUCCESS);
4170 }
4171 
4172 /* ARGSUSED */
4173 static rdma_stat
4174 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4175     int addr_type, void *handle, CONN **conn)
4176 {
4177 	rdma_stat status;
4178 	rpcib_ping_t rpt;
4179 
4180 	status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4181 	return (status);
4182 }
4183 
/*
 * rib_find_hca_connection
 *
 * if there is an existing connection to the specified address then
 * it will be returned in conn, otherwise conn will be set to NULL.
 * Also cleans up any connection that is in error state.
 *
 * hca		- HCA whose client connection list is searched.
 * s_svcaddr	- source address; only compared when its len is non-zero.
 * d_svcaddr	- destination address to match against c_raddr.
 * conn		- out: the matching connection, with a reference added.
 *
 * Returns:
 *	RDMA_SUCCESS	a connection in C_CONNECTED state was found.
 *	RDMA_INTR	the wait for a pending connection returned 0
 *			from cv_timedwait_sig().
 *	RDMA_TIMEDOUT	the wait returned a negative value, or the
 *			connection was not C_CONNECTED after the wait.
 *	RDMA_FAILED	no matching connection on this HCA.
 */
static int
rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
    struct netbuf *d_svcaddr, CONN **conn)
{
	CONN *cn;
	clock_t cv_stat, timout;

	*conn = NULL;
again:
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	cn = hca->cl_conn_list.conn_hd;
	while (cn != NULL) {
		/*
		 * First, clear up any connection in the ERROR state
		 */
		mutex_enter(&cn->c_lock);
		if (cn->c_state == C_ERROR_CONN) {
			if (cn->c_ref == 0) {
				/*
				 * Remove connection from list and destroy it.
				 * Both locks are dropped before closing, so
				 * the list walk is restarted from the head.
				 */
				cn->c_state = C_DISCONN_PEND;
				mutex_exit(&cn->c_lock);
				rw_exit(&hca->cl_conn_list.conn_lock);
				rib_conn_close((void *)cn);
				goto again;
			}
			/* Still referenced: leave it and move on. */
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}
		if (cn->c_state == C_DISCONN_PEND) {
			/* Already being destroyed; skip it. */
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}

		/*
		 * source address is only checked for if there is one,
		 * this is the case for retries.
		 */
		if ((cn->c_raddr.len == d_svcaddr->len) &&
		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
		    d_svcaddr->len) == 0) &&
		    ((s_svcaddr->len == 0) ||
		    ((cn->c_laddr.len == s_svcaddr->len) &&
		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
		    s_svcaddr->len) == 0)))) {
			/*
			 * Our connection. Give up conn list lock
			 * as we are done traversing the list.
			 * cn->c_lock is still held from here on.
			 */
			rw_exit(&hca->cl_conn_list.conn_lock);
			if (cn->c_state == C_CONNECTED) {
				cn->c_ref++;	/* sharing a conn */
				mutex_exit(&cn->c_lock);
				*conn = cn;
				return (RDMA_SUCCESS);
			}
			if (cn->c_state == C_CONN_PEND) {
				/*
				 * Hold a reference to this conn before
				 * we give up the lock.
				 */
				cn->c_ref++;
				/*
				 * Wait on c_cv until the state leaves
				 * C_CONN_PEND, the wait returns 0, or
				 * the deadline CONN_WAIT_TIME seconds
				 * from now passes.
				 */
				timout =  ddi_get_lbolt() +
				    drv_usectohz(CONN_WAIT_TIME * 1000000);
				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
				    &cn->c_lock, timout)) > 0 &&
				    cn->c_state == C_CONN_PEND)
					;
				/*
				 * NOTE(review): the failure paths below
				 * return without an explicit mutex_exit;
				 * presumably rib_conn_release_locked()
				 * drops both the reference and c_lock --
				 * confirm.
				 */
				if (cv_stat == 0) {
					(void) rib_conn_release_locked(cn);
					return (RDMA_INTR);
				}
				if (cv_stat < 0) {
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
				if (cn->c_state == C_CONNECTED) {
					/* Keep the reference taken above. */
					*conn = cn;
					mutex_exit(&cn->c_lock);
					return (RDMA_SUCCESS);
				} else {
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
			}
		}
		mutex_exit(&cn->c_lock);
		cn = cn->c_next;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	*conn = NULL;
	return (RDMA_FAILED);
}
4287 
4288 /*
4289  * Connection management.
4290  * IBTF does not support recycling of channels. So connections are only
4291  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
4292  * C_DISCONN_PEND state. No C_IDLE state.
4293  * C_CONN_PEND state: Connection establishment in progress to the server.
4294  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4295  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4296  * only in this state.
4297  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4298  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4299  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4300  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4301  * c_ref drops to 0 (this indicates that RPC has no more references to this
4302  * connection), the connection should be destroyed. A connection transitions
4303  * into this state when it is being destroyed.
4304  */
4305 /* ARGSUSED */
4306 static rdma_stat
4307 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4308     int addr_type, rpcib_ping_t *rpt, CONN **conn)
4309 {
4310 	CONN *cn;
4311 	int status;
4312 	rib_hca_t *hca;
4313 	rib_qp_t *qp;
4314 	int s_addr_len;
4315 	char *s_addr_buf;
4316 
4317 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4318 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4319 		rw_enter(&hca->state_lock, RW_READER);
4320 		if (hca->state != HCA_DETACHED) {
4321 			status = rib_find_hca_connection(hca, s_svcaddr,
4322 			    d_svcaddr, conn);
4323 			rw_exit(&hca->state_lock);
4324 			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4325 				rw_exit(&rib_stat->hcas_list_lock);
4326 				return (status);
4327 			}
4328 		} else
4329 			rw_exit(&hca->state_lock);
4330 	}
4331 	rw_exit(&rib_stat->hcas_list_lock);
4332 
4333 	/*
4334 	 * No existing connection found, establish a new connection.
4335 	 */
4336 	bzero(rpt, sizeof (rpcib_ping_t));
4337 
4338 	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4339 	if (status != RDMA_SUCCESS) {
4340 		return (RDMA_FAILED);
4341 	}
4342 	hca = rpt->hca;
4343 
4344 	if (rpt->srcip.family == AF_INET) {
4345 		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4346 		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4347 	} else if (rpt->srcip.family == AF_INET6) {
4348 		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4349 		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4350 	} else {
4351 		return (RDMA_FAILED);
4352 	}
4353 
4354 	/*
4355 	 * Channel to server doesn't exist yet, create one.
4356 	 */
4357 	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4358 		return (RDMA_FAILED);
4359 	}
4360 	cn = qptoc(qp);
4361 	cn->c_state = C_CONN_PEND;
4362 	cn->c_ref = 1;
4363 
4364 	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4365 	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4366 	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4367 
4368 	if (rpt->srcip.family == AF_INET) {
4369 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4370 		(void) strcpy(cn->c_netid, RIBNETID_TCP);
4371 	} else {
4372 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4373 		(void) strcpy(cn->c_netid, RIBNETID_TCP6);
4374 	}
4375 
4376 	/*
4377 	 * Add to conn list.
4378 	 * We had given up the READER lock. In the time since then,
4379 	 * another thread might have created the connection we are
4380 	 * trying here. But for now, that is quiet alright - there
4381 	 * might be two connections between a pair of hosts instead
4382 	 * of one. If we really want to close that window,
4383 	 * then need to check the list after acquiring the
4384 	 * WRITER lock.
4385 	 */
4386 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4387 	status = rib_conn_to_srv(hca, qp, rpt);
4388 	mutex_enter(&cn->c_lock);
4389 
4390 	if (cn->c_flags & C_CLOSE_PENDING) {
4391 		/*
4392 		 * This handles a case where the module or
4393 		 * HCA detached in the time a connection is
4394 		 * established. In such a case close the
4395 		 * connection immediately if this is the
4396 		 * only reference.
4397 		 */
4398 		if (cn->c_ref == 1) {
4399 			cn->c_ref--;
4400 			cn->c_state = C_DISCONN_PEND;
4401 			mutex_exit(&cn->c_lock);
4402 			rib_conn_close((void *)cn);
4403 			return (RDMA_FAILED);
4404 		}
4405 
4406 		/*
4407 		 * Connection to be closed later when c_ref = 0
4408 		 */
4409 		status = RDMA_FAILED;
4410 	}
4411 
4412 	if (status == RDMA_SUCCESS) {
4413 		cn->c_state = C_CONNECTED;
4414 		*conn = cn;
4415 	} else {
4416 		cn->c_state = C_ERROR_CONN;
4417 		cn->c_ref--;
4418 	}
4419 	cv_signal(&cn->c_cv);
4420 	mutex_exit(&cn->c_lock);
4421 	return (status);
4422 }
4423 
4424 static void
4425 rib_conn_close(void *rarg)
4426 {
4427 	CONN *conn = (CONN *)rarg;
4428 	rib_qp_t *qp = ctoqp(conn);
4429 
4430 	mutex_enter(&conn->c_lock);
4431 	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4432 
4433 		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4434 
4435 		/*
4436 		 * Live connection in CONNECTED state.
4437 		 */
4438 		if (conn->c_state == C_CONNECTED) {
4439 			conn->c_state = C_ERROR_CONN;
4440 		}
4441 		mutex_exit(&conn->c_lock);
4442 
4443 		rib_close_a_channel(conn);
4444 
4445 		mutex_enter(&conn->c_lock);
4446 		conn->c_flags &= ~C_CLOSE_PENDING;
4447 	}
4448 
4449 	mutex_exit(&conn->c_lock);
4450 
4451 	if (qp->mode == RIB_SERVER)
4452 		(void) rib_disconnect_channel(conn,
4453 		    &qp->hca->srv_conn_list);
4454 	else
4455 		(void) rib_disconnect_channel(conn,
4456 		    &qp->hca->cl_conn_list);
4457 }
4458 
4459 static void
4460 rib_conn_timeout_call(void *carg)
4461 {
4462 	time_t idle_time;
4463 	CONN *conn = (CONN *)carg;
4464 	rib_hca_t *hca = ctoqp(conn)->hca;
4465 	int error;
4466 
4467 	mutex_enter(&conn->c_lock);
4468 	if ((conn->c_ref > 0) ||
4469 	    (conn->c_state == C_DISCONN_PEND)) {
4470 		conn->c_timeout = NULL;
4471 		mutex_exit(&conn->c_lock);
4472 		return;
4473 	}
4474 
4475 	idle_time = (gethrestime_sec() - conn->c_last_used);
4476 
4477 	if ((idle_time <= rib_conn_timeout) &&
4478 	    (conn->c_state != C_ERROR_CONN)) {
4479 		/*
4480 		 * There was activity after the last timeout.
4481 		 * Extend the conn life. Unless the conn is
4482 		 * already in error state.
4483 		 */
4484 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4485 		    SEC_TO_TICK(rib_conn_timeout - idle_time));
4486 		mutex_exit(&conn->c_lock);
4487 		return;
4488 	}
4489 
4490 	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4491 	    (void *)conn, DDI_NOSLEEP);
4492 
4493 	/*
4494 	 * If taskq dispatch fails above, then reset the timeout
4495 	 * to try again after 10 secs.
4496 	 */
4497 
4498 	if (error != DDI_SUCCESS) {
4499 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4500 		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4501 		mutex_exit(&conn->c_lock);
4502 		return;
4503 	}
4504 
4505 	conn->c_state = C_DISCONN_PEND;
4506 	mutex_exit(&conn->c_lock);
4507 }
4508 
4509 static rdma_stat
4510 rib_conn_release(CONN *conn)
4511 {
4512 	mutex_enter(&conn->c_lock);
4513 	return (rib_conn_release_locked(conn));
4514 }
4515 
4516 /*
4517  * Expects conn->c_lock to be held on entry.
4518  * c_lock released on return
4519  */
4520 static rdma_stat
4521 rib_conn_release_locked(CONN *conn)
4522 {
4523 	conn->c_ref--;
4524 
4525 	conn->c_last_used = gethrestime_sec();
4526 	if (conn->c_ref > 0) {
4527 		mutex_exit(&conn->c_lock);
4528 		return (RDMA_SUCCESS);
4529 	}
4530 
4531 	/*
4532 	 * If a conn is C_ERROR_CONN, close the channel.
4533 	 */
4534 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4535 		conn->c_state = C_DISCONN_PEND;
4536 		mutex_exit(&conn->c_lock);
4537 		rib_conn_close((void *)conn);
4538 		return (RDMA_SUCCESS);
4539 	}
4540 
4541 	/*
4542 	 * c_ref == 0, set a timeout for conn release
4543 	 */
4544 
4545 	if (conn->c_timeout == NULL) {
4546 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4547 		    SEC_TO_TICK(rib_conn_timeout));
4548 	}
4549 
4550 	mutex_exit(&conn->c_lock);
4551 	return (RDMA_SUCCESS);
4552 }
4553 
4554 /*
4555  * Add at front of list
4556  */
4557 static struct rdma_done_list *
4558 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4559 {
4560 	struct rdma_done_list *rd;
4561 
4562 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4563 
4564 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4565 	rd->xid = xid;
4566 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4567 
4568 	rd->prev = NULL;
4569 	rd->next = qp->rdlist;
4570 	if (qp->rdlist != NULL)
4571 		qp->rdlist->prev = rd;
4572 	qp->rdlist = rd;
4573 
4574 	return (rd);
4575 }
4576 
4577 static void
4578 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4579 {
4580 	struct rdma_done_list *r;
4581 
4582 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4583 
4584 	r = rd->next;
4585 	if (r != NULL) {
4586 		r->prev = rd->prev;
4587 	}
4588 
4589 	r = rd->prev;
4590 	if (r != NULL) {
4591 		r->next = rd->next;
4592 	} else {
4593 		qp->rdlist = rd->next;
4594 	}
4595 
4596 	cv_destroy(&rd->rdma_done_cv);
4597 	kmem_free(rd, sizeof (*rd));
4598 }
4599 
4600 static void
4601 rdma_done_rem_list(rib_qp_t *qp)
4602 {
4603 	struct rdma_done_list	*r, *n;
4604 
4605 	mutex_enter(&qp->rdlist_lock);
4606 	for (r = qp->rdlist; r != NULL; r = n) {
4607 		n = r->next;
4608 		rdma_done_rm(qp, r);
4609 	}
4610 	mutex_exit(&qp->rdlist_lock);
4611 }
4612 
4613 static void
4614 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4615 {
4616 	struct rdma_done_list *r = qp->rdlist;
4617 
4618 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4619 
4620 	while (r) {
4621 		if (r->xid == xid) {
4622 			cv_signal(&r->rdma_done_cv);
4623 			return;
4624 		} else {
4625 			r = r->next;
4626 		}
4627 	}
4628 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4629 	    int, xid);
4630 }
4631 
/*
 * Close the RC channel behind `conn', if its QP still has a channel
 * handle.
 *
 * NOTE(review): this comment used to say that conn->c_lock is expected to
 * be held by the caller. The callers visible in this file
 * (rib_conn_close() and rib_close_channels()) both drop c_lock right
 * before calling in and re-take it afterwards, and the close below is
 * issued in blocking mode, so the routine appears to be meant to run
 * without c_lock held. Confirm the intended locking contract.
 */

static void
rib_close_a_channel(CONN *conn)
{
	rib_qp_t	*qp;
	qp = ctoqp(conn);

	if (qp->qp_hdl == NULL) {
		/* channel already freed */
		return;
	}

	/*
	 * Call ibt_close_rc_channel in blocking mode
	 * with no callbacks. The return value is deliberately ignored.
	 */
	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
	    NULL, 0, NULL, NULL, 0);
}
4654 
4655 /*
4656  * Goes through all connections and closes the channel
4657  * This will cause all the WRs on those channels to be
4658  * flushed.
4659  */
4660 static void
4661 rib_close_channels(rib_conn_list_t *connlist)
4662 {
4663 	CONN 		*conn, *tmp;
4664 
4665 	rw_enter(&connlist->conn_lock, RW_READER);
4666 	conn = connlist->conn_hd;
4667 	while (conn != NULL) {
4668 		mutex_enter(&conn->c_lock);
4669 		tmp = conn->c_next;
4670 		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4671 
4672 			if (conn->c_state == C_CONN_PEND) {
4673 				conn->c_flags |= C_CLOSE_PENDING;
4674 				goto next;
4675 			}
4676 
4677 			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4678 
4679 			/*
4680 			 * Live connection in CONNECTED state.
4681 			 */
4682 			if (conn->c_state == C_CONNECTED)
4683 				conn->c_state = C_ERROR_CONN;
4684 			mutex_exit(&conn->c_lock);
4685 
4686 			rib_close_a_channel(conn);
4687 
4688 			mutex_enter(&conn->c_lock);
4689 			conn->c_flags &= ~C_CLOSE_PENDING;
4690 			/* Signal a pending rib_disconnect_channel() */
4691 			cv_signal(&conn->c_cv);
4692 		}
4693 next:
4694 		mutex_exit(&conn->c_lock);
4695 		conn = tmp;
4696 	}
4697 	rw_exit(&connlist->conn_lock);
4698 }
4699 
4700 /*
4701  * Frees up all connections that are no longer being referenced
4702  */
4703 static void
4704 rib_purge_connlist(rib_conn_list_t *connlist)
4705 {
4706 	CONN 		*conn;
4707 
4708 top:
4709 	rw_enter(&connlist->conn_lock, RW_READER);
4710 	conn = connlist->conn_hd;
4711 	while (conn != NULL) {
4712 		mutex_enter(&conn->c_lock);
4713 
4714 		/*
4715 		 * At this point connection is either in ERROR
4716 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4717 		 * then some other thread is culling that connection.
4718 		 * If not and if c_ref is 0, then destroy the connection.
4719 		 */
4720 		if (conn->c_ref == 0 &&
4721 		    conn->c_state != C_DISCONN_PEND) {
4722 			/*
4723 			 * Cull the connection
4724 			 */
4725 			conn->c_state = C_DISCONN_PEND;
4726 			mutex_exit(&conn->c_lock);
4727 			rw_exit(&connlist->conn_lock);
4728 			(void) rib_disconnect_channel(conn, connlist);
4729 			goto top;
4730 		} else {
4731 			/*
4732 			 * conn disconnect already scheduled or will
4733 			 * happen from conn_release when c_ref drops to 0.
4734 			 */
4735 			mutex_exit(&conn->c_lock);
4736 		}
4737 		conn = conn->c_next;
4738 	}
4739 	rw_exit(&connlist->conn_lock);
4740 
4741 	/*
4742 	 * At this point, only connections with c_ref != 0 are on the list
4743 	 */
4744 }
4745 
4746 /*
4747  * Free all the HCA resources and close
4748  * the hca.
4749  */
4750 
4751 static void
4752 rib_free_hca(rib_hca_t *hca)
4753 {
4754 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4755 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4756 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4757 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4758 
4759 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4760 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4761 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4762 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4763 
4764 	rib_rbufpool_destroy(hca, RECV_BUFFER);
4765 	rib_rbufpool_destroy(hca, SEND_BUFFER);
4766 	rib_destroy_cache(hca);
4767 	if (rib_mod.rdma_count == 0)
4768 		(void) rdma_unregister_mod(&rib_mod);
4769 	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4770 	(void) ibt_close_hca(hca->hca_hdl);
4771 	hca->hca_hdl = NULL;
4772 }
4773 
4774 
/*
 * Shut down everything that uses `hca': stop its services, close the
 * channels of all client and server connections, cull the connections
 * that are no longer referenced, and release the HCA resources with
 * rib_free_hca(). If connections remain on either list, the release is
 * delayed until hca->inuse has been cleared.
 */
static void
rib_stop_hca_services(rib_hca_t *hca)
{
	rib_stop_services(hca);
	rib_close_channels(&hca->cl_conn_list);
	rib_close_channels(&hca->srv_conn_list);

	rib_purge_connlist(&hca->cl_conn_list);
	rib_purge_connlist(&hca->srv_conn_list);

	/* With no HCA left on the list, the cache kstat is removed. */
	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
		    GLOBAL_ZONEID);
		stats_enabled = FALSE;
	}

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_free_hca(hca);
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	/*
	 * rib_free_hca() clears hca_hdl, so a handle that is still set
	 * means connections were left on a list above. Wait on cb_cv
	 * until the HCA is no longer marked in use, then free it.
	 */
	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);

		rib_free_hca(hca);
	}
	rw_destroy(&hca->bound_services_lock);

	if (hca->cleanup_helper != NULL) {
		ddi_taskq_destroy(hca->cleanup_helper);
		hca->cleanup_helper = NULL;
	}
}
4819 
4820 /*
4821  * Cleans and closes up all uses of the HCA
4822  */
4823 static void
4824 rib_detach_hca(ibt_hca_hdl_t hca_hdl)
4825 {
4826 	rib_hca_t *hca = NULL;
4827 	rib_hca_t **hcap;
4828 
4829 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4830 	for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
4831 		hca = *hcap;
4832 		rw_enter(&hca->state_lock, RW_WRITER);
4833 		if (hca->hca_hdl == hca_hdl) {
4834 			/*
4835 			 * Mark as detached and remove from
4836 			 * hca list.
4837 			 */
4838 			hca->state = HCA_DETACHED;
4839 			*hcap = hca->next;
4840 			rib_stat->nhca_inited--;
4841 			rib_mod.rdma_count--;
4842 			rw_exit(&hca->state_lock);
4843 			break;
4844 		}
4845 		rw_exit(&hca->state_lock);
4846 	}
4847 	rw_exit(&rib_stat->hcas_list_lock);
4848 
4849 	if (hca == NULL)
4850 		return;
4851 	ASSERT(hca->hca_hdl == hca_hdl);
4852 
4853 	/*
4854 	 * Stop all services on the HCA
4855 	 * Go through cl_conn_list and close all rc_channels
4856 	 * Go through svr_conn_list and close all rc_channels
4857 	 * Free connections whose c_ref has dropped to 0
4858 	 * Destroy all CQs
4859 	 * Deregister and released all buffer pool memory after all
4860 	 * connections are destroyed
4861 	 * Free the protection domain
4862 	 * ibt_close_hca()
4863 	 */
4864 	rib_stop_hca_services(hca);
4865 
4866 	kmem_free(hca, sizeof (*hca));
4867 }
4868 
4869 static void
4870 rib_server_side_cache_reclaim(void *argp)
4871 {
4872 	cache_avl_struct_t    *rcas;
4873 	rib_lrc_entry_t		*rb;
4874 	rib_hca_t *hca = (rib_hca_t *)argp;
4875 
4876 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4877 	rcas = avl_first(&hca->avl_tree);
4878 	if (rcas != NULL)
4879 		avl_remove(&hca->avl_tree, rcas);
4880 
4881 	while (rcas != NULL) {
4882 		while (rcas->r.forw != &rcas->r) {
4883 			rcas->elements--;
4884 			rb = rcas->r.forw;
4885 			remque(rb);
4886 			if (rb->registered)
4887 				(void) rib_deregistermem_via_hca(hca,
4888 				    rb->lrc_buf, rb->lrc_mhandle);
4889 
4890 			hca->cache_allocation -= rb->lrc_len;
4891 			kmem_free(rb->lrc_buf, rb->lrc_len);
4892 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4893 		}
4894 		mutex_destroy(&rcas->node_lock);
4895 		kmem_cache_free(hca->server_side_cache, rcas);
4896 		rcas = avl_first(&hca->avl_tree);
4897 		if (rcas != NULL)
4898 			avl_remove(&hca->avl_tree, rcas);
4899 	}
4900 	rw_exit(&hca->avl_rw_lock);
4901 }
4902 
4903 static void
4904 rib_server_side_cache_cleanup(void *argp)
4905 {
4906 	cache_avl_struct_t    *rcas;
4907 	rib_lrc_entry_t		*rb;
4908 	rib_hca_t *hca = (rib_hca_t *)argp;
4909 
4910 	mutex_enter(&hca->cache_allocation_lock);
4911 	if (hca->cache_allocation < cache_limit) {
4912 		mutex_exit(&hca->cache_allocation_lock);
4913 		return;
4914 	}
4915 	mutex_exit(&hca->cache_allocation_lock);
4916 
4917 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4918 	rcas = avl_last(&hca->avl_tree);
4919 	if (rcas != NULL)
4920 		avl_remove(&hca->avl_tree, rcas);
4921 
4922 	while (rcas != NULL) {
4923 		while (rcas->r.forw != &rcas->r) {
4924 			rcas->elements--;
4925 			rb = rcas->r.forw;
4926 			remque(rb);
4927 			if (rb->registered)
4928 				(void) rib_deregistermem_via_hca(hca,
4929 				    rb->lrc_buf, rb->lrc_mhandle);
4930 
4931 			hca->cache_allocation -= rb->lrc_len;
4932 
4933 			kmem_free(rb->lrc_buf, rb->lrc_len);
4934 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4935 		}
4936 		mutex_destroy(&rcas->node_lock);
4937 		if (hca->server_side_cache) {
4938 			kmem_cache_free(hca->server_side_cache, rcas);
4939 		}
4940 
4941 		if (hca->cache_allocation < cache_limit) {
4942 			rw_exit(&hca->avl_rw_lock);
4943 			return;
4944 		}
4945 
4946 		rcas = avl_last(&hca->avl_tree);
4947 		if (rcas != NULL)
4948 			avl_remove(&hca->avl_tree, rcas);
4949 	}
4950 	rw_exit(&hca->avl_rw_lock);
4951 }
4952 
4953 static int
4954 avl_compare(const void *t1, const void *t2)
4955 {
4956 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4957 		return (0);
4958 
4959 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4960 		return (-1);
4961 
4962 	return (1);
4963 }
4964 
4965 static void
4966 rib_destroy_cache(rib_hca_t *hca)
4967 {
4968 	if (hca->avl_init) {
4969 		rib_server_side_cache_reclaim((void *)hca);
4970 		if (hca->server_side_cache) {
4971 			kmem_cache_destroy(hca->server_side_cache);
4972 			hca->server_side_cache = NULL;
4973 		}
4974 		avl_destroy(&hca->avl_tree);
4975 		mutex_destroy(&hca->cache_allocation_lock);
4976 		rw_destroy(&hca->avl_rw_lock);
4977 	}
4978 	hca->avl_init = FALSE;
4979 }
4980 
4981 static void
4982 rib_force_cleanup(void *hca)
4983 {
4984 	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
4985 		(void) ddi_taskq_dispatch(
4986 		    ((rib_hca_t *)hca)->cleanup_helper,
4987 		    rib_server_side_cache_cleanup,
4988 		    (void *)hca, DDI_NOSLEEP);
4989 }
4990 
4991 static rib_lrc_entry_t *
4992 rib_get_cache_buf(CONN *conn, uint32_t len)
4993 {
4994 	cache_avl_struct_t	cas, *rcas;
4995 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4996 	rib_lrc_entry_t *reply_buf;
4997 	avl_index_t where = NULL;
4998 	uint64_t c_alloc = 0;
4999 
5000 	if (!hca->avl_init)
5001 		goto  error_alloc;
5002 
5003 	cas.len = len;
5004 
5005 	rw_enter(&hca->avl_rw_lock, RW_READER);
5006 
5007 	mutex_enter(&hca->cache_allocation_lock);
5008 	c_alloc = hca->cache_allocation;
5009 	mutex_exit(&hca->cache_allocation_lock);
5010 
5011 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5012 	    &where)) == NULL) {
5013 		/* Am I above the cache limit */
5014 		if ((c_alloc + len) >= cache_limit) {
5015 			rib_force_cleanup((void *)hca);
5016 			rw_exit(&hca->avl_rw_lock);
5017 			mutex_enter(&hca->cache_allocation_lock);
5018 			hca->cache_misses_above_the_limit ++;
5019 			mutex_exit(&hca->cache_allocation_lock);
5020 
5021 			/* Allocate and register the buffer directly */
5022 			goto error_alloc;
5023 		}
5024 
5025 		rw_exit(&hca->avl_rw_lock);
5026 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
5027 
5028 		/* Recheck to make sure no other thread added the entry in */
5029 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5030 		    &cas, &where)) == NULL) {
5031 			/* Allocate an avl tree entry */
5032 			rcas = (cache_avl_struct_t *)
5033 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5034 
5035 			bzero(rcas, sizeof (cache_avl_struct_t));
5036 			rcas->elements = 0;
5037 			rcas->r.forw = &rcas->r;
5038 			rcas->r.back = &rcas->r;
5039 			rcas->len = len;
5040 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5041 			avl_insert(&hca->avl_tree, rcas, where);
5042 		}
5043 	}
5044 
5045 	mutex_enter(&rcas->node_lock);
5046 
5047 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5048 		reply_buf = rcas->r.forw;
5049 		remque(reply_buf);
5050 		rcas->elements--;
5051 		mutex_exit(&rcas->node_lock);
5052 		rw_exit(&hca->avl_rw_lock);
5053 
5054 		mutex_enter(&hca->cache_allocation_lock);
5055 		hca->cache_hits++;
5056 		hca->cache_allocation -= len;
5057 		mutex_exit(&hca->cache_allocation_lock);
5058 	} else {
5059 		/* Am I above the cache limit */
5060 		mutex_exit(&rcas->node_lock);
5061 		if ((c_alloc + len) >= cache_limit) {
5062 			rib_force_cleanup((void *)hca);
5063 			rw_exit(&hca->avl_rw_lock);
5064 
5065 			mutex_enter(&hca->cache_allocation_lock);
5066 			hca->cache_misses_above_the_limit++;
5067 			mutex_exit(&hca->cache_allocation_lock);
5068 			/* Allocate and register the buffer directly */
5069 			goto error_alloc;
5070 		}
5071 		rw_exit(&hca->avl_rw_lock);
5072 		mutex_enter(&hca->cache_allocation_lock);
5073 		hca->cache_misses++;
5074 		mutex_exit(&hca->cache_allocation_lock);
5075 		/* Allocate a reply_buf entry */
5076 		reply_buf = (rib_lrc_entry_t *)
5077 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5078 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
5079 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
5080 		reply_buf->lrc_len  = len;
5081 		reply_buf->registered = FALSE;
5082 		reply_buf->avl_node = (void *)rcas;
5083 	}
5084 
5085 	return (reply_buf);
5086 
5087 error_alloc:
5088 	reply_buf = (rib_lrc_entry_t *)
5089 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5090 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
5091 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5092 	reply_buf->lrc_len = len;
5093 	reply_buf->registered = FALSE;
5094 	reply_buf->avl_node = NULL;
5095 
5096 	return (reply_buf);
5097 }
5098 
5099 /*
5100  * Return a pre-registered back to the cache (without
5101  * unregistering the buffer)..
5102  */
5103 
5104 static void
5105 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5106 {
5107 	cache_avl_struct_t    cas, *rcas;
5108 	avl_index_t where = NULL;
5109 	rib_hca_t	*hca = (ctoqp(conn))->hca;
5110 
5111 	if (!hca->avl_init)
5112 		goto  error_free;
5113 
5114 	cas.len = reg_buf->lrc_len;
5115 	rw_enter(&hca->avl_rw_lock, RW_READER);
5116 	if ((rcas = (cache_avl_struct_t *)
5117 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5118 		rw_exit(&hca->avl_rw_lock);
5119 		goto error_free;
5120 	} else {
5121 		cas.len = reg_buf->lrc_len;
5122 		mutex_enter(&rcas->node_lock);
5123 		insque(reg_buf, &rcas->r);
5124 		rcas->elements ++;
5125 		mutex_exit(&rcas->node_lock);
5126 		rw_exit(&hca->avl_rw_lock);
5127 		mutex_enter(&hca->cache_allocation_lock);
5128 		hca->cache_allocation += cas.len;
5129 		mutex_exit(&hca->cache_allocation_lock);
5130 	}
5131 
5132 	return;
5133 
5134 error_free:
5135 
5136 	if (reg_buf->registered)
5137 		(void) rib_deregistermem_via_hca(hca,
5138 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5139 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5140 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5141 }
5142 
5143 static rdma_stat
5144 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5145 	uint_t buflen, struct mrc *buf_handle)
5146 {
5147 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
5148 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
5149 	rdma_stat	status;
5150 
5151 
5152 	/*
5153 	 * Note: ALL buffer pools use the same memory type RDMARW.
5154 	 */
5155 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5156 	if (status == RDMA_SUCCESS) {
5157 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5158 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5159 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5160 	} else {
5161 		buf_handle->mrc_linfo = NULL;
5162 		buf_handle->mrc_lmr = 0;
5163 		buf_handle->mrc_rmr = 0;
5164 	}
5165 	return (status);
5166 }
5167 
/*
 * Deregister a buffer through rib_deregistermem_via_hca(). `sync_handle'
 * is not used. Always returns RDMA_SUCCESS; the result of the
 * deregistration is ignored.
 */
/* ARGSUSED */
static rdma_stat
rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
    struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
{

	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
	return (RDMA_SUCCESS);
}
5177 
5178 /* ARGSUSED */
5179 static rdma_stat
5180 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5181 {
5182 
5183 	(void) ibt_deregister_mr(hca->hca_hdl,
5184 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5185 	return (RDMA_SUCCESS);
5186 }
5187 
5188 /*
5189  * Check if the IP interface named by `lifrp' is RDMA-capable.
5190  */
5191 static boolean_t
5192 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5193 {
5194 	char ifname[LIFNAMSIZ];
5195 	char *cp;
5196 
5197 	if (lifrp->lifr_type == IFT_IB)
5198 		return (B_TRUE);
5199 
5200 	/*
5201 	 * Strip off the logical interface portion before getting
5202 	 * intimate with the name.
5203 	 */
5204 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5205 	if ((cp = strchr(ifname, ':')) != NULL)
5206 		*cp = '\0';
5207 
5208 	return (strcmp("lo0", ifname) == 0);
5209 }
5210 
5211 static int
5212 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5213 {
5214 	vnode_t *kkvp, *vp;
5215 	TIUSER  *tiptr;
5216 	struct  strioctl iocb;
5217 	k_sigset_t smask;
5218 	int	err = 0;
5219 
5220 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
5221 		if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
5222 		    &tiptr, CRED()) == 0) {
5223 			vp = tiptr->fp->f_vnode;
5224 		} else {
5225 			VN_RELE(kkvp);
5226 			return (EPROTO);
5227 		}
5228 	} else {
5229 		return (EPROTO);
5230 	}
5231 
5232 	iocb.ic_cmd = cmd;
5233 	iocb.ic_timout = 0;
5234 	iocb.ic_len = len;
5235 	iocb.ic_dp = (caddr_t)arg;
5236 	sigintr(&smask, 0);
5237 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5238 	sigunintr(&smask);
5239 	(void) t_kclose(tiptr, 0);
5240 	VN_RELE(kkvp);
5241 	return (err);
5242 }
5243 
5244 /*
5245  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5246  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5247  */
5248 static int
5249 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5250 {
5251 	int err;
5252 	struct lifnum lifn;
5253 
5254 	bzero(&lifn, sizeof (struct lifnum));
5255 	lifn.lifn_family = AF_UNSPEC;
5256 
5257 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5258 	if (err != 0)
5259 		return (err);
5260 
5261 	/*
5262 	 * Pad the interface count to account for additional interfaces that
5263 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5264 	 */
5265 	lifn.lifn_count += 4;
5266 
5267 	bzero(lifcp, sizeof (struct lifconf));
5268 	lifcp->lifc_family = AF_UNSPEC;
5269 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5270 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5271 
5272 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5273 	if (err != 0) {
5274 		kmem_free(lifcp->lifc_buf, *bufsizep);
5275 		return (err);
5276 	}
5277 	return (0);
5278 }
5279 
5280 static boolean_t
5281 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5282 {
5283 	uint_t i, nifs;
5284 	uint_t bufsize;
5285 	struct lifconf lifc;
5286 	struct lifreq *lifrp;
5287 	struct sockaddr_in *sinp;
5288 	struct sockaddr_in6 *sin6p;
5289 
5290 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5291 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5292 
5293 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5294 		return (B_FALSE);
5295 
5296 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5297 		kmem_free(lifc.lifc_buf, bufsize);
5298 		return (B_FALSE);
5299 	}
5300 
5301 	/*
5302 	 * Worst case is that all of the addresses are IB-capable and have
5303 	 * the same address family, so size our buffers accordingly.
5304 	 */
5305 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5306 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5307 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5308 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5309 
5310 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5311 		if (!rpcib_rdma_capable_interface(lifrp))
5312 			continue;
5313 
5314 		if (lifrp->lifr_addr.ss_family == AF_INET) {
5315 			sinp = addrs4->ri_list;
5316 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5317 			    sizeof (struct sockaddr_in));
5318 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5319 			sin6p = addrs6->ri_list;
5320 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5321 			    sizeof (struct sockaddr_in6));
5322 		}
5323 	}
5324 
5325 	kmem_free(lifc.lifc_buf, bufsize);
5326 	return (B_TRUE);
5327 }
5328 
5329 /* ARGSUSED */
5330 static int
5331 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5332 {
5333 	rib_hca_t *hca;
5334 
5335 	if (KSTAT_WRITE == rw) {
5336 		return (EACCES);
5337 	}
5338 
5339 	rpcib_kstat.cache_limit.value.ui64 =
5340 	    (uint64_t)cache_limit;
5341 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5342 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5343 		rpcib_kstat.cache_allocation.value.ui64 +=
5344 		    (uint64_t)hca->cache_allocation;
5345 		rpcib_kstat.cache_hits.value.ui64 +=
5346 		    (uint64_t)hca->cache_hits;
5347 		rpcib_kstat.cache_misses.value.ui64 +=
5348 		    (uint64_t)hca->cache_misses;
5349 		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5350 		    (uint64_t)hca->cache_misses_above_the_limit;
5351 	}
5352 	rw_exit(&rib_stat->hcas_list_lock);
5353 	return (0);
5354 }
5355