xref: /titanic_50/usr/src/uts/common/rpc/rpcib.c (revision 5547f1d8c68fa119522b859bbf38315dc773e696)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developors:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * The rpcib plugin. Implements the interface for RDMATF's
42  * interaction with IBTF.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84 
85 #define	NFS_RDMA_PORT	20049
86 
87 
88 /*
89  * Convenience structures for connection management
90  */
91 typedef struct rpcib_ipaddrs {
92 	void	*ri_list;	/* pointer to list of addresses */
93 	uint_t	ri_count;	/* number of addresses in list */
94 	uint_t	ri_size;	/* size of ri_list in bytes */
95 } rpcib_ipaddrs_t;
96 
97 
98 typedef struct rpcib_ping {
99 	rib_hca_t  *hca;
100 	ibt_path_info_t path;
101 	ibt_ip_addr_t srcip;
102 	ibt_ip_addr_t dstip;
103 } rpcib_ping_t;
104 
105 /*
106  * Prototype declarations for driver ops
107  */
108 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 				void *, void **);
111 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 static int	rpcib_do_ip_ioctl(int, int, void *);
114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 static int rpcib_cache_kstat_update(kstat_t *, int);
116 static void rib_force_cleanup(void *);
117 
118 struct {
119 	kstat_named_t cache_limit;
120 	kstat_named_t cache_allocation;
121 	kstat_named_t cache_hits;
122 	kstat_named_t cache_misses;
123 	kstat_named_t cache_misses_above_the_limit;
124 } rpcib_kstat = {
125 	{"cache_limit",			KSTAT_DATA_UINT64 },
126 	{"cache_allocation",		KSTAT_DATA_UINT64 },
127 	{"cache_hits",			KSTAT_DATA_UINT64 },
128 	{"cache_misses",		KSTAT_DATA_UINT64 },
129 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
130 };
131 
132 /* rpcib cb_ops */
133 static struct cb_ops rpcib_cbops = {
134 	nulldev,		/* open */
135 	nulldev,		/* close */
136 	nodev,			/* strategy */
137 	nodev,			/* print */
138 	nodev,			/* dump */
139 	nodev,			/* read */
140 	nodev,			/* write */
141 	nodev,			/* ioctl */
142 	nodev,			/* devmap */
143 	nodev,			/* mmap */
144 	nodev,			/* segmap */
145 	nochpoll,		/* poll */
146 	ddi_prop_op,		/* prop_op */
147 	NULL,			/* stream */
148 	D_MP,			/* cb_flag */
149 	CB_REV,			/* rev */
150 	nodev,			/* int (*cb_aread)() */
151 	nodev			/* int (*cb_awrite)() */
152 };
153 
154 /*
155  * Device options
156  */
157 static struct dev_ops rpcib_ops = {
158 	DEVO_REV,		/* devo_rev, */
159 	0,			/* refcnt  */
160 	rpcib_getinfo,		/* info */
161 	nulldev,		/* identify */
162 	nulldev,		/* probe */
163 	rpcib_attach,		/* attach */
164 	rpcib_detach,		/* detach */
165 	nodev,			/* reset */
166 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
167 	NULL,			/* bus operations */
168 	NULL,			/* power */
169 	ddi_quiesce_not_needed,		/* quiesce */
170 };
171 
172 /*
173  * Module linkage information.
174  */
175 
176 static struct modldrv rib_modldrv = {
177 	&mod_driverops,		/* Driver module */
178 	"RPCIB plugin driver",	/* Driver name and version */
179 	&rpcib_ops,		/* Driver ops */
180 };
181 
182 static struct modlinkage rib_modlinkage = {
183 	MODREV_1,
184 	(void *)&rib_modldrv,
185 	NULL
186 };
187 
188 typedef struct rib_lrc_entry {
189 	struct rib_lrc_entry *forw;
190 	struct rib_lrc_entry *back;
191 	char *lrc_buf;
192 
193 	uint32_t lrc_len;
194 	void  *avl_node;
195 	bool_t registered;
196 
197 	struct mrc lrc_mhandle;
198 	bool_t lrc_on_freed_list;
199 } rib_lrc_entry_t;
200 
201 typedef	struct cache_struct	{
202 	rib_lrc_entry_t		r;
203 	uint32_t		len;
204 	uint32_t		elements;
205 	kmutex_t		node_lock;
206 	avl_node_t		avl_link;
207 } cache_avl_struct_t;
208 
209 static uint64_t	rib_total_buffers = 0;
210 uint64_t	cache_limit = 100 * 1024 * 1024;
211 static volatile uint64_t	cache_allocation = 0;
212 static uint64_t	cache_watermark = 80 * 1024 * 1024;
213 static uint64_t	cache_hits = 0;
214 static uint64_t	cache_misses = 0;
215 static uint64_t	cache_cold_misses = 0;
216 static uint64_t	cache_hot_misses = 0;
217 static uint64_t	cache_misses_above_the_limit = 0;
218 static bool_t	stats_enabled = FALSE;
219 
220 static uint64_t max_unsignaled_rws = 5;
221 int nfs_rdma_port = NFS_RDMA_PORT;
222 
223 /*
224  * rib_stat: private data pointer used when registering
225  *	with the IBTF.  It is returned to the consumer
226  *	in all callbacks.
227  */
228 static rpcib_state_t *rib_stat = NULL;
229 
230 #define	RNR_RETRIES	IBT_RNR_RETRY_1
231 #define	MAX_PORTS	2
232 #define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
233 #define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */
234 
235 int preposted_rbufs = RDMA_BUFS_GRANT;
236 int send_threshold = 1;
237 
238 /*
239  * Old cards with Tavor driver have limited memory footprint
240  * when booted in 32bit. The rib_max_rbufs tunable can be
241  * tuned for more buffers if needed.
242  */
243 
244 #if !defined(_ELF64) && !defined(__sparc)
245 int rib_max_rbufs = MAX_BUFS;
246 #else
247 int rib_max_rbufs = 10 * MAX_BUFS;
248 #endif	/* !(_ELF64) && !(__sparc) */
249 
250 int rib_conn_timeout = 60 * 12;		/* 12 minutes */
251 
252 /*
253  * State of the plugin.
254  * ACCEPT = accepting new connections and requests.
255  * NO_ACCEPT = not accepting new connection and requests.
256  * This should eventually move to rpcib_state_t structure, since this
257  * will tell in which state the plugin is for a particular type of service
258  * like NFS, NLM or v4 Callback deamon. The plugin might be in accept
259  * state for one and in no_accept state for the other.
260  */
261 int		plugin_state;
262 kmutex_t	plugin_state_lock;
263 
264 ldi_ident_t rpcib_li;
265 
266 /*
267  * RPCIB RDMATF operations
268  */
269 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
270 static rdma_stat rib_disconnect(CONN *conn);
271 static void rib_listen(struct rdma_svc_data *rd);
272 static void rib_listen_stop(struct rdma_svc_data *rd);
273 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
274 	uint_t buflen, struct mrc *buf_handle);
275 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
276 	struct mrc buf_handle);
277 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
278 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
279 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
280 		struct mrc buf_handle);
281 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
282 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
283 	void *lrc);
284 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
285 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
286 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
287 	caddr_t buf, int len, int cpu);
288 
289 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
290 
291 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
292 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
293 
294 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
295 
296 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
297 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
298 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
299 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
300 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
301 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
302 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
303 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
304 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
305 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
306 static rdma_stat rib_conn_release(CONN *conn);
307 static rdma_stat rib_getinfo(rdma_info_t *info);
308 
309 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
310 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
311 static void rib_destroy_cache(rib_hca_t *hca);
312 static	void	rib_server_side_cache_reclaim(void *argp);
313 static int avl_compare(const void *t1, const void *t2);
314 
315 static void rib_stop_services(rib_hca_t *);
316 static void rib_close_channels(rib_conn_list_t *);
317 static void rib_conn_close(void *);
318 
319 /*
320  * RPCIB addressing operations
321  */
322 
323 /*
324  * RDMA operations the RPCIB module exports
325  */
326 static rdmaops_t rib_ops = {
327 	rib_reachable,
328 	rib_conn_get,
329 	rib_conn_release,
330 	rib_listen,
331 	rib_listen_stop,
332 	rib_registermem,
333 	rib_deregistermem,
334 	rib_registermemsync,
335 	rib_deregistermemsync,
336 	rib_syncmem,
337 	rib_reg_buf_alloc,
338 	rib_reg_buf_free,
339 	rib_send,
340 	rib_send_resp,
341 	rib_post_resp,
342 	rib_post_resp_remove,
343 	rib_post_recv,
344 	rib_recv,
345 	rib_read,
346 	rib_write,
347 	rib_getinfo,
348 };
349 
350 /*
351  * RDMATF RPCIB plugin details
352  */
353 static rdma_mod_t rib_mod = {
354 	"ibtf",		/* api name */
355 	RDMATF_VERS_1,
356 	0,
357 	&rib_ops,	/* rdma op vector for ibtf */
358 };
359 
360 static rdma_stat open_hcas(rpcib_state_t *);
361 static rdma_stat rib_qp_init(rib_qp_t *, int);
362 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
363 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
364 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
365 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
366 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
367 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
368 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
369 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
370 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
371 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
372 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
373 	rib_qp_t **);
374 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
375 	rib_qp_t **);
376 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
377 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
378 static int rib_free_sendwait(struct send_wid *);
379 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
380 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
381 static void rdma_done_rem_list(rib_qp_t *);
382 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
383 
384 static void rib_async_handler(void *,
385 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
386 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
387 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
388 static int rib_free_svc_recv(struct svc_recv *);
389 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
390 static void rib_free_wid(struct recv_wid *);
391 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
392 static void rib_detach_hca(rib_hca_t *);
393 static void rib_close_a_channel(CONN *);
394 static void rib_send_hold(rib_qp_t *);
395 static void rib_send_rele(rib_qp_t *);
396 
397 /*
398  * Registration with IBTF as a consumer
399  */
400 static struct ibt_clnt_modinfo_s rib_modinfo = {
401 	IBTI_V_CURR,
402 	IBT_GENERIC,
403 	rib_async_handler,	/* async event handler */
404 	NULL,			/* Memory Region Handler */
405 	"nfs/ib"
406 };
407 
408 /*
409  * Global strucuture
410  */
411 
412 typedef struct rpcib_s {
413 	dev_info_t	*rpcib_dip;
414 	kmutex_t	rpcib_mutex;
415 } rpcib_t;
416 
417 rpcib_t rpcib;
418 
419 /*
420  * /etc/system controlled variable to control
421  * debugging in rpcib kernel module.
422  * Set it to values greater that 1 to control
423  * the amount of debugging messages required.
424  */
425 int rib_debug = 0;
426 
427 int
428 _init(void)
429 {
430 	int error;
431 
432 	error = mod_install((struct modlinkage *)&rib_modlinkage);
433 	if (error != 0) {
434 		/*
435 		 * Could not load module
436 		 */
437 		return (error);
438 	}
439 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
440 	return (0);
441 }
442 
443 int
444 _fini()
445 {
446 	int status;
447 
448 	/*
449 	 * Remove module
450 	 */
451 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
452 		return (status);
453 	}
454 	mutex_destroy(&plugin_state_lock);
455 	return (0);
456 }
457 
458 int
459 _info(struct modinfo *modinfop)
460 {
461 	return (mod_info(&rib_modlinkage, modinfop));
462 }
463 
464 /*
465  * rpcib_getinfo()
466  * Given the device number, return the devinfo pointer or the
467  * instance number.
468  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
469  */
470 
471 /*ARGSUSED*/
472 static int
473 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
474 {
475 	int ret = DDI_SUCCESS;
476 
477 	switch (cmd) {
478 	case DDI_INFO_DEVT2DEVINFO:
479 		if (rpcib.rpcib_dip != NULL)
480 			*result = rpcib.rpcib_dip;
481 		else {
482 			*result = NULL;
483 			ret = DDI_FAILURE;
484 		}
485 		break;
486 
487 	case DDI_INFO_DEVT2INSTANCE:
488 		*result = NULL;
489 		break;
490 
491 	default:
492 		ret = DDI_FAILURE;
493 	}
494 	return (ret);
495 }
496 
497 static int
498 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
499 {
500 	ibt_status_t	ibt_status;
501 	rdma_stat	r_status;
502 
503 	switch (cmd) {
504 	case DDI_ATTACH:
505 		break;
506 	case DDI_RESUME:
507 		return (DDI_SUCCESS);
508 	default:
509 		return (DDI_FAILURE);
510 	}
511 
512 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
513 
514 	mutex_enter(&rpcib.rpcib_mutex);
515 	if (rpcib.rpcib_dip != NULL) {
516 		mutex_exit(&rpcib.rpcib_mutex);
517 		return (DDI_FAILURE);
518 	}
519 	rpcib.rpcib_dip = dip;
520 	mutex_exit(&rpcib.rpcib_mutex);
521 	/*
522 	 * Create the "rpcib" minor-node.
523 	 */
524 	if (ddi_create_minor_node(dip,
525 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
526 		/* Error message, no cmn_err as they print on console */
527 		return (DDI_FAILURE);
528 	}
529 
530 	if (rib_stat == NULL) {
531 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
532 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
533 	}
534 
535 	rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
536 	if (rib_stat->hca_count < 1) {
537 		mutex_destroy(&rib_stat->open_hca_lock);
538 		kmem_free(rib_stat, sizeof (*rib_stat));
539 		rib_stat = NULL;
540 		return (DDI_FAILURE);
541 	}
542 
543 	ibt_status = ibt_attach(&rib_modinfo, dip,
544 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
545 
546 	if (ibt_status != IBT_SUCCESS) {
547 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
548 		mutex_destroy(&rib_stat->open_hca_lock);
549 		kmem_free(rib_stat, sizeof (*rib_stat));
550 		rib_stat = NULL;
551 		return (DDI_FAILURE);
552 	}
553 
554 	mutex_enter(&rib_stat->open_hca_lock);
555 	if (open_hcas(rib_stat) != RDMA_SUCCESS) {
556 		mutex_exit(&rib_stat->open_hca_lock);
557 		goto open_fail;
558 	}
559 	mutex_exit(&rib_stat->open_hca_lock);
560 
561 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
562 	    DDI_PROP_SUCCESS) {
563 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
564 		    "failed.");
565 		goto register_fail;
566 	}
567 
568 	/*
569 	 * Register with rdmatf
570 	 */
571 	rib_mod.rdma_count = rib_stat->nhca_inited;
572 	r_status = rdma_register_mod(&rib_mod);
573 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
574 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
575 		    "status = %d", r_status);
576 		goto register_fail;
577 	}
578 
579 	return (DDI_SUCCESS);
580 
581 register_fail:
582 	rib_detach_hca(rib_stat->hca);
583 open_fail:
584 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
585 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
586 	mutex_destroy(&rib_stat->open_hca_lock);
587 	kmem_free(rib_stat, sizeof (*rib_stat));
588 	rib_stat = NULL;
589 	return (DDI_FAILURE);
590 }
591 
592 /*ARGSUSED*/
593 static int
594 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
595 {
596 	switch (cmd) {
597 
598 	case DDI_DETACH:
599 		break;
600 
601 	case DDI_SUSPEND:
602 	default:
603 		return (DDI_FAILURE);
604 	}
605 
606 	/*
607 	 * Detach the hca and free resources
608 	 */
609 	mutex_enter(&plugin_state_lock);
610 	plugin_state = NO_ACCEPT;
611 	mutex_exit(&plugin_state_lock);
612 	rib_detach_hca(rib_stat->hca);
613 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
614 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
615 	mutex_destroy(&rib_stat->open_hca_lock);
616 	if (rib_stat->hcas) {
617 		kmem_free(rib_stat->hcas, rib_stat->hca_count *
618 		    sizeof (rib_hca_t));
619 		rib_stat->hcas = NULL;
620 	}
621 	kmem_free(rib_stat, sizeof (*rib_stat));
622 	rib_stat = NULL;
623 
624 	mutex_enter(&rpcib.rpcib_mutex);
625 	rpcib.rpcib_dip = NULL;
626 	mutex_exit(&rpcib.rpcib_mutex);
627 	mutex_destroy(&rpcib.rpcib_mutex);
628 	return (DDI_SUCCESS);
629 }
630 
631 
632 static void rib_rbufpool_free(rib_hca_t *, int);
633 static void rib_rbufpool_deregister(rib_hca_t *, int);
634 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
635 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
636 static rdma_stat rib_rem_replylist(rib_qp_t *);
637 static int rib_remreply(rib_qp_t *, struct reply *);
638 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
639 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
640 
641 
642 /*
643  * One CQ pair per HCA
644  */
645 static rdma_stat
646 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
647 	rib_cq_t **cqp, rpcib_state_t *ribstat)
648 {
649 	rib_cq_t	*cq;
650 	ibt_cq_attr_t	cq_attr;
651 	uint32_t	real_size;
652 	ibt_status_t	status;
653 	rdma_stat	error = RDMA_SUCCESS;
654 
655 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
656 	cq->rib_hca = hca;
657 	cq_attr.cq_size = cq_size;
658 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
659 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
660 	    &real_size);
661 	if (status != IBT_SUCCESS) {
662 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
663 		    " status=%d", status);
664 		error = RDMA_FAILED;
665 		goto fail;
666 	}
667 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
668 
669 	/*
670 	 * Enable CQ callbacks. CQ Callbacks are single shot
671 	 * (e.g. you have to call ibt_enable_cq_notify()
672 	 * after each callback to get another one).
673 	 */
674 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
675 	if (status != IBT_SUCCESS) {
676 		cmn_err(CE_WARN, "rib_create_cq: "
677 		    "enable_cq_notify failed, status %d", status);
678 		error = RDMA_FAILED;
679 		goto fail;
680 	}
681 	*cqp = cq;
682 
683 	return (error);
684 fail:
685 	if (cq->rib_cq_hdl)
686 		(void) ibt_free_cq(cq->rib_cq_hdl);
687 	if (cq)
688 		kmem_free(cq, sizeof (rib_cq_t));
689 	return (error);
690 }
691 
692 static rdma_stat
693 open_hcas(rpcib_state_t *ribstat)
694 {
695 	rib_hca_t		*hca;
696 	ibt_status_t		ibt_status;
697 	rdma_stat		status;
698 	ibt_hca_portinfo_t	*pinfop;
699 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
700 	uint_t			size, cq_size;
701 	int			i;
702 	kstat_t *ksp;
703 	cache_avl_struct_t example_avl_node;
704 	char rssc_name[32];
705 
706 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
707 
708 	if (ribstat->hcas == NULL)
709 		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
710 		    sizeof (rib_hca_t), KM_SLEEP);
711 
712 	/*
713 	 * Open a hca and setup for RDMA
714 	 */
715 	for (i = 0; i < ribstat->hca_count; i++) {
716 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
717 		    ribstat->hca_guids[i],
718 		    &ribstat->hcas[i].hca_hdl);
719 		if (ibt_status != IBT_SUCCESS) {
720 			continue;
721 		}
722 		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
723 		hca = &(ribstat->hcas[i]);
724 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
725 		hca->state = HCA_INITED;
726 
727 		/*
728 		 * query HCA info
729 		 */
730 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
731 		if (ibt_status != IBT_SUCCESS) {
732 			goto fail1;
733 		}
734 
735 		/*
736 		 * One PD (Protection Domain) per HCA.
737 		 * A qp is allowed to access a memory region
738 		 * only when it's in the same PD as that of
739 		 * the memory region.
740 		 */
741 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
742 		if (ibt_status != IBT_SUCCESS) {
743 			goto fail1;
744 		}
745 
746 		/*
747 		 * query HCA ports
748 		 */
749 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
750 		    0, &pinfop, &hca->hca_nports, &size);
751 		if (ibt_status != IBT_SUCCESS) {
752 			goto fail2;
753 		}
754 		hca->hca_ports = pinfop;
755 		hca->hca_pinfosz = size;
756 		pinfop = NULL;
757 
758 		cq_size = DEF_CQ_SIZE; /* default cq size */
759 		/*
760 		 * Create 2 pairs of cq's (1 pair for client
761 		 * and the other pair for server) on this hca.
762 		 * If number of qp's gets too large, then several
763 		 * cq's will be needed.
764 		 */
765 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
766 		    &hca->svc_rcq, ribstat);
767 		if (status != RDMA_SUCCESS) {
768 			goto fail3;
769 		}
770 
771 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
772 		    &hca->svc_scq, ribstat);
773 		if (status != RDMA_SUCCESS) {
774 			goto fail3;
775 		}
776 
777 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
778 		    &hca->clnt_rcq, ribstat);
779 		if (status != RDMA_SUCCESS) {
780 			goto fail3;
781 		}
782 
783 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
784 		    &hca->clnt_scq, ribstat);
785 		if (status != RDMA_SUCCESS) {
786 			goto fail3;
787 		}
788 
789 		/*
790 		 * Create buffer pools.
791 		 * Note rib_rbuf_create also allocates memory windows.
792 		 */
793 		hca->recv_pool = rib_rbufpool_create(hca,
794 		    RECV_BUFFER, rib_max_rbufs);
795 		if (hca->recv_pool == NULL) {
796 			goto fail3;
797 		}
798 
799 		hca->send_pool = rib_rbufpool_create(hca,
800 		    SEND_BUFFER, rib_max_rbufs);
801 		if (hca->send_pool == NULL) {
802 			rib_rbufpool_destroy(hca, RECV_BUFFER);
803 			goto fail3;
804 		}
805 
806 		if (hca->server_side_cache == NULL) {
807 			(void) sprintf(rssc_name,
808 			    "rib_server_side_cache_%04d", i);
809 			hca->server_side_cache = kmem_cache_create(
810 			    rssc_name,
811 			    sizeof (cache_avl_struct_t), 0,
812 			    NULL,
813 			    NULL,
814 			    rib_server_side_cache_reclaim,
815 			    hca, NULL, 0);
816 		}
817 
818 		avl_create(&hca->avl_tree,
819 		    avl_compare,
820 		    sizeof (cache_avl_struct_t),
821 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
822 		    (uint_t)(uintptr_t)&example_avl_node);
823 
824 		rw_init(&hca->avl_rw_lock,
825 		    NULL, RW_DRIVER, hca->iblock);
826 		mutex_init(&hca->cache_allocation,
827 		    NULL, MUTEX_DRIVER, NULL);
828 		hca->avl_init = TRUE;
829 
830 		/* Create kstats for the cache */
831 		ASSERT(INGLOBALZONE(curproc));
832 
833 		if (!stats_enabled) {
834 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
835 			    KSTAT_TYPE_NAMED,
836 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
837 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
838 			    GLOBAL_ZONEID);
839 			if (ksp) {
840 				ksp->ks_data = (void *) &rpcib_kstat;
841 				ksp->ks_update = rpcib_cache_kstat_update;
842 				kstat_install(ksp);
843 				stats_enabled = TRUE;
844 			}
845 		}
846 		if (hca->cleanup_helper == NULL) {
847 			hca->cleanup_helper = ddi_taskq_create(NULL,
848 			    "CLEANUP_HELPER", 1, TASKQ_DEFAULTPRI, 0);
849 		}
850 
851 		/*
852 		 * Initialize the registered service list and
853 		 * the lock
854 		 */
855 		hca->service_list = NULL;
856 		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
857 
858 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
859 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
860 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
861 		    hca->iblock);
862 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
863 		    hca->iblock);
864 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
865 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
866 		hca->inuse = TRUE;
867 		/*
868 		 * XXX One hca only. Add multi-hca functionality if needed
869 		 * later.
870 		 */
871 		ribstat->hca = hca;
872 		ribstat->nhca_inited++;
873 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
874 		break;
875 
876 fail3:
877 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
878 fail2:
879 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
880 fail1:
881 		(void) ibt_close_hca(hca->hca_hdl);
882 
883 	}
884 	if (ribstat->hca != NULL)
885 		return (RDMA_SUCCESS);
886 	else
887 		return (RDMA_FAILED);
888 }
889 
890 /*
891  * Callback routines
892  */
893 
894 /*
895  * SCQ handlers
896  */
897 /* ARGSUSED */
898 static void
899 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
900 {
901 	ibt_status_t	ibt_status;
902 	ibt_wc_t	wc;
903 	struct send_wid	*wd;
904 	CONN		*conn;
905 	rib_qp_t	*qp;
906 	int		i;
907 
908 	/*
909 	 * Re-enable cq notify here to avoid missing any
910 	 * completion queue notification.
911 	 */
912 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
913 
914 	ibt_status = IBT_SUCCESS;
915 	while (ibt_status != IBT_CQ_EMPTY) {
916 		bzero(&wc, sizeof (wc));
917 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
918 		if (ibt_status != IBT_SUCCESS)
919 			return;
920 
921 		/*
922 		 * Got a send completion
923 		 */
924 		if (wc.wc_id != RDMA_DUMMY_WRID) {
925 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
926 			qp = wd->qp;
927 			conn = qptoc(qp);
928 
929 			mutex_enter(&wd->sendwait_lock);
930 			switch (wc.wc_status) {
931 			case IBT_WC_SUCCESS:
932 				wd->status = RDMA_SUCCESS;
933 				break;
934 			default:
935 /*
936  *    RC Send Q Error Code		Local state     Remote State
937  *    ==================== 		===========     ============
938  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
939  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
940  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
941  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
942  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
943  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
944  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
945  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
946  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
947  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
948  *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
949  */
950 				/*
951 				 * Channel in error state. Set connection to
952 				 * ERROR and cleanup will happen either from
953 				 * conn_release  or from rib_conn_get
954 				 */
955 				wd->status = RDMA_FAILED;
956 				mutex_enter(&conn->c_lock);
957 				if (conn->c_state != C_DISCONN_PEND)
958 					conn->c_state = C_ERROR_CONN;
959 				mutex_exit(&conn->c_lock);
960 				break;
961 			}
962 
963 			if (wd->cv_sig == 1) {
964 				/*
965 				 * Notify poster
966 				 */
967 				cv_signal(&wd->wait_cv);
968 				mutex_exit(&wd->sendwait_lock);
969 			} else {
970 				/*
971 				 * Poster not waiting for notification.
972 				 * Free the send buffers and send_wid
973 				 */
974 				for (i = 0; i < wd->nsbufs; i++) {
975 					rib_rbuf_free(qptoc(wd->qp),
976 					    SEND_BUFFER,
977 					    (void *)(uintptr_t)wd->sbufaddr[i]);
978 				}
979 
980 				/* decrement the send ref count */
981 				rib_send_rele(qp);
982 
983 				mutex_exit(&wd->sendwait_lock);
984 				(void) rib_free_sendwait(wd);
985 			}
986 		}
987 	}
988 }
989 
990 /* ARGSUSED */
991 static void
992 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
993 {
994 	ibt_status_t	ibt_status;
995 	ibt_wc_t	wc;
996 	struct send_wid	*wd;
997 	rib_qp_t	*qp;
998 	CONN		*conn;
999 	int		i;
1000 
1001 	/*
1002 	 * Re-enable cq notify here to avoid missing any
1003 	 * completion queue notification.
1004 	 */
1005 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1006 
1007 	ibt_status = IBT_SUCCESS;
1008 	while (ibt_status != IBT_CQ_EMPTY) {
1009 		bzero(&wc, sizeof (wc));
1010 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1011 		if (ibt_status != IBT_SUCCESS)
1012 			return;
1013 
1014 		/*
1015 		 * Got a send completion
1016 		 */
1017 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1018 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1019 			qp = wd->qp;
1020 			conn = qptoc(qp);
1021 			mutex_enter(&wd->sendwait_lock);
1022 
1023 			switch (wc.wc_status) {
1024 			case IBT_WC_SUCCESS:
1025 				wd->status = RDMA_SUCCESS;
1026 				break;
1027 			default:
1028 				/*
1029 				 * Channel in error state. Set connection to
1030 				 * ERROR and cleanup will happen either from
1031 				 * conn_release  or conn timeout.
1032 				 */
1033 				wd->status = RDMA_FAILED;
1034 				mutex_enter(&conn->c_lock);
1035 				if (conn->c_state != C_DISCONN_PEND)
1036 					conn->c_state = C_ERROR_CONN;
1037 				mutex_exit(&conn->c_lock);
1038 				break;
1039 			}
1040 
1041 			if (wd->cv_sig == 1) {
1042 				/*
1043 				 * Update completion status and notify poster
1044 				 */
1045 				cv_signal(&wd->wait_cv);
1046 				mutex_exit(&wd->sendwait_lock);
1047 			} else {
1048 				/*
1049 				 * Poster not waiting for notification.
1050 				 * Free the send buffers and send_wid
1051 				 */
1052 				for (i = 0; i < wd->nsbufs; i++) {
1053 					rib_rbuf_free(qptoc(wd->qp),
1054 					    SEND_BUFFER,
1055 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1056 				}
1057 
1058 				/* decrement the send ref count */
1059 				rib_send_rele(qp);
1060 
1061 				mutex_exit(&wd->sendwait_lock);
1062 				(void) rib_free_sendwait(wd);
1063 			}
1064 		}
1065 	}
1066 }
1067 
1068 /*
1069  * RCQ handler
1070  */
1071 /* ARGSUSED */
1072 static void
1073 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1074 {
1075 	rib_qp_t	*qp;
1076 	ibt_status_t	ibt_status;
1077 	ibt_wc_t	wc;
1078 	struct recv_wid	*rwid;
1079 
1080 	/*
1081 	 * Re-enable cq notify here to avoid missing any
1082 	 * completion queue notification.
1083 	 */
1084 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1085 
1086 	ibt_status = IBT_SUCCESS;
1087 	while (ibt_status != IBT_CQ_EMPTY) {
1088 		bzero(&wc, sizeof (wc));
1089 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1090 		if (ibt_status != IBT_SUCCESS)
1091 			return;
1092 
1093 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1094 		qp = rwid->qp;
1095 		if (wc.wc_status == IBT_WC_SUCCESS) {
1096 			XDR	inxdrs, *xdrs;
1097 			uint_t	xid, vers, op, find_xid = 0;
1098 			struct reply	*r;
1099 			CONN *conn = qptoc(qp);
1100 			uint32_t rdma_credit = 0;
1101 
1102 			xdrs = &inxdrs;
1103 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1104 			    wc.wc_bytes_xfer, XDR_DECODE);
1105 			/*
1106 			 * Treat xid as opaque (xid is the first entity
1107 			 * in the rpc rdma message).
1108 			 */
1109 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1110 
1111 			/* Skip xid and set the xdr position accordingly. */
1112 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1113 			(void) xdr_u_int(xdrs, &vers);
1114 			(void) xdr_u_int(xdrs, &rdma_credit);
1115 			(void) xdr_u_int(xdrs, &op);
1116 			XDR_DESTROY(xdrs);
1117 
1118 			if (vers != RPCRDMA_VERS) {
1119 				/*
1120 				 * Invalid RPC/RDMA version. Cannot
1121 				 * interoperate.  Set connection to
1122 				 * ERROR state and bail out.
1123 				 */
1124 				mutex_enter(&conn->c_lock);
1125 				if (conn->c_state != C_DISCONN_PEND)
1126 					conn->c_state = C_ERROR_CONN;
1127 				mutex_exit(&conn->c_lock);
1128 				rib_rbuf_free(conn, RECV_BUFFER,
1129 				    (void *)(uintptr_t)rwid->addr);
1130 				rib_free_wid(rwid);
1131 				continue;
1132 			}
1133 
1134 			mutex_enter(&qp->replylist_lock);
1135 			for (r = qp->replylist; r != NULL; r = r->next) {
1136 				if (r->xid == xid) {
1137 					find_xid = 1;
1138 					switch (op) {
1139 					case RDMA_MSG:
1140 					case RDMA_NOMSG:
1141 					case RDMA_MSGP:
1142 						r->status = RDMA_SUCCESS;
1143 						r->vaddr_cq = rwid->addr;
1144 						r->bytes_xfer =
1145 						    wc.wc_bytes_xfer;
1146 						cv_signal(&r->wait_cv);
1147 						break;
1148 					default:
1149 						rib_rbuf_free(qptoc(qp),
1150 						    RECV_BUFFER,
1151 						    (void *)(uintptr_t)
1152 						    rwid->addr);
1153 						break;
1154 					}
1155 					break;
1156 				}
1157 			}
1158 			mutex_exit(&qp->replylist_lock);
1159 			if (find_xid == 0) {
1160 				/* RPC caller not waiting for reply */
1161 
1162 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1163 				    int, xid);
1164 
1165 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1166 				    (void *)(uintptr_t)rwid->addr);
1167 			}
1168 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1169 			CONN *conn = qptoc(qp);
1170 
1171 			/*
1172 			 * Connection being flushed. Just free
1173 			 * the posted buffer
1174 			 */
1175 			rib_rbuf_free(conn, RECV_BUFFER,
1176 			    (void *)(uintptr_t)rwid->addr);
1177 		} else {
1178 			CONN *conn = qptoc(qp);
1179 /*
1180  *  RC Recv Q Error Code		Local state     Remote State
1181  *  ====================		===========     ============
1182  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1183  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1184  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1185  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1186  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1187  *  IBT_WC_WR_FLUSHED_ERR               None            None
1188  */
1189 			/*
1190 			 * Channel in error state. Set connection
1191 			 * in ERROR state.
1192 			 */
1193 			mutex_enter(&conn->c_lock);
1194 			if (conn->c_state != C_DISCONN_PEND)
1195 				conn->c_state = C_ERROR_CONN;
1196 			mutex_exit(&conn->c_lock);
1197 			rib_rbuf_free(conn, RECV_BUFFER,
1198 			    (void *)(uintptr_t)rwid->addr);
1199 		}
1200 		rib_free_wid(rwid);
1201 	}
1202 }
1203 
1204 /* Server side */
1205 /* ARGSUSED */
1206 static void
1207 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1208 {
1209 	rdma_recv_data_t *rdp;
1210 	rib_qp_t	*qp;
1211 	ibt_status_t	ibt_status;
1212 	ibt_wc_t	wc;
1213 	struct svc_recv	*s_recvp;
1214 	CONN		*conn;
1215 	mblk_t		*mp;
1216 
1217 	/*
1218 	 * Re-enable cq notify here to avoid missing any
1219 	 * completion queue notification.
1220 	 */
1221 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1222 
1223 	ibt_status = IBT_SUCCESS;
1224 	while (ibt_status != IBT_CQ_EMPTY) {
1225 		bzero(&wc, sizeof (wc));
1226 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1227 		if (ibt_status != IBT_SUCCESS)
1228 			return;
1229 
1230 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1231 		qp = s_recvp->qp;
1232 		conn = qptoc(qp);
1233 		mutex_enter(&qp->posted_rbufs_lock);
1234 		qp->n_posted_rbufs--;
1235 		if (qp->n_posted_rbufs == 0)
1236 			cv_signal(&qp->posted_rbufs_cv);
1237 		mutex_exit(&qp->posted_rbufs_lock);
1238 
1239 		if (wc.wc_status == IBT_WC_SUCCESS) {
1240 			XDR	inxdrs, *xdrs;
1241 			uint_t	xid, vers, op;
1242 			uint32_t rdma_credit;
1243 
1244 			xdrs = &inxdrs;
1245 			/* s_recvp->vaddr stores data */
1246 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1247 			    wc.wc_bytes_xfer, XDR_DECODE);
1248 
1249 			/*
1250 			 * Treat xid as opaque (xid is the first entity
1251 			 * in the rpc rdma message).
1252 			 */
1253 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1254 			/* Skip xid and set the xdr position accordingly. */
1255 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1256 			if (!xdr_u_int(xdrs, &vers) ||
1257 			    !xdr_u_int(xdrs, &rdma_credit) ||
1258 			    !xdr_u_int(xdrs, &op)) {
1259 				rib_rbuf_free(conn, RECV_BUFFER,
1260 				    (void *)(uintptr_t)s_recvp->vaddr);
1261 				XDR_DESTROY(xdrs);
1262 				(void) rib_free_svc_recv(s_recvp);
1263 				continue;
1264 			}
1265 			XDR_DESTROY(xdrs);
1266 
1267 			if (vers != RPCRDMA_VERS) {
1268 				/*
1269 				 * Invalid RPC/RDMA version.
1270 				 * Drop rpc rdma message.
1271 				 */
1272 				rib_rbuf_free(conn, RECV_BUFFER,
1273 				    (void *)(uintptr_t)s_recvp->vaddr);
1274 				(void) rib_free_svc_recv(s_recvp);
1275 				continue;
1276 			}
1277 			/*
1278 			 * Is this for RDMA_DONE?
1279 			 */
1280 			if (op == RDMA_DONE) {
1281 				rib_rbuf_free(conn, RECV_BUFFER,
1282 				    (void *)(uintptr_t)s_recvp->vaddr);
1283 				/*
1284 				 * Wake up the thread waiting on
1285 				 * a RDMA_DONE for xid
1286 				 */
1287 				mutex_enter(&qp->rdlist_lock);
1288 				rdma_done_notify(qp, xid);
1289 				mutex_exit(&qp->rdlist_lock);
1290 				(void) rib_free_svc_recv(s_recvp);
1291 				continue;
1292 			}
1293 
1294 			mutex_enter(&plugin_state_lock);
1295 			if (plugin_state == ACCEPT) {
1296 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1297 				    == NULL)
1298 					(void) strwaitbuf(
1299 					    sizeof (*rdp), BPRI_LO);
1300 				/*
1301 				 * Plugin is in accept state, hence the master
1302 				 * transport queue for this is still accepting
1303 				 * requests. Hence we can call svc_queuereq to
1304 				 * queue this recieved msg.
1305 				 */
1306 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1307 				rdp->conn = conn;
1308 				rdp->rpcmsg.addr =
1309 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1310 				rdp->rpcmsg.type = RECV_BUFFER;
1311 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1312 				rdp->status = wc.wc_status;
1313 				mutex_enter(&conn->c_lock);
1314 				conn->c_ref++;
1315 				mutex_exit(&conn->c_lock);
1316 				mp->b_wptr += sizeof (*rdp);
1317 				svc_queuereq((queue_t *)rib_stat->q, mp);
1318 				mutex_exit(&plugin_state_lock);
1319 			} else {
1320 				/*
1321 				 * The master transport for this is going
1322 				 * away and the queue is not accepting anymore
1323 				 * requests for krpc, so don't do anything, just
1324 				 * free the msg.
1325 				 */
1326 				mutex_exit(&plugin_state_lock);
1327 				rib_rbuf_free(conn, RECV_BUFFER,
1328 				    (void *)(uintptr_t)s_recvp->vaddr);
1329 			}
1330 		} else {
1331 			rib_rbuf_free(conn, RECV_BUFFER,
1332 			    (void *)(uintptr_t)s_recvp->vaddr);
1333 		}
1334 		(void) rib_free_svc_recv(s_recvp);
1335 	}
1336 }
1337 
1338 /*
1339  * Handles DR event of IBT_HCA_DETACH_EVENT.
1340  */
1341 /* ARGSUSED */
1342 static void
1343 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1344 	ibt_async_code_t code, ibt_async_event_t *event)
1345 {
1346 
1347 	switch (code) {
1348 	case IBT_HCA_ATTACH_EVENT:
1349 		/* ignore */
1350 		break;
1351 	case IBT_HCA_DETACH_EVENT:
1352 	{
1353 		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1354 		rib_detach_hca(rib_stat->hca);
1355 #ifdef DEBUG
1356 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1357 #endif
1358 		break;
1359 	}
1360 #ifdef DEBUG
1361 	case IBT_EVENT_PATH_MIGRATED:
1362 		cmn_err(CE_NOTE, "rib_async_handler(): "
1363 		    "IBT_EVENT_PATH_MIGRATED\n");
1364 		break;
1365 	case IBT_EVENT_SQD:
1366 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1367 		break;
1368 	case IBT_EVENT_COM_EST:
1369 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1370 		break;
1371 	case IBT_ERROR_CATASTROPHIC_CHAN:
1372 		cmn_err(CE_NOTE, "rib_async_handler(): "
1373 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1374 		break;
1375 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1376 		cmn_err(CE_NOTE, "rib_async_handler(): "
1377 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1378 		break;
1379 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1380 		cmn_err(CE_NOTE, "rib_async_handler(): "
1381 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1382 		break;
1383 	case IBT_ERROR_PATH_MIGRATE_REQ:
1384 		cmn_err(CE_NOTE, "rib_async_handler(): "
1385 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1386 		break;
1387 	case IBT_ERROR_CQ:
1388 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1389 		break;
1390 	case IBT_ERROR_PORT_DOWN:
1391 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1392 		break;
1393 	case IBT_EVENT_PORT_UP:
1394 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1395 		break;
1396 	case IBT_ASYNC_OPAQUE1:
1397 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1398 		break;
1399 	case IBT_ASYNC_OPAQUE2:
1400 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1401 		break;
1402 	case IBT_ASYNC_OPAQUE3:
1403 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1404 		break;
1405 	case IBT_ASYNC_OPAQUE4:
1406 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1407 		break;
1408 #endif
1409 	default:
1410 		break;
1411 	}
1412 }
1413 
1414 /*
1415  * Client's reachable function.
1416  */
1417 static rdma_stat
1418 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1419 {
1420 	rdma_stat	status;
1421 	rpcib_ping_t	rpt;
1422 
1423 	/*
1424 	 * First check if a hca is still attached
1425 	 */
1426 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
1427 	if (rib_stat->hca->state != HCA_INITED) {
1428 		rw_exit(&rib_stat->hca->state_lock);
1429 		return (RDMA_FAILED);
1430 	}
1431 
1432 	bzero(&rpt, sizeof (rpcib_ping_t));
1433 	status = rib_ping_srv(addr_type, raddr, &rpt);
1434 	rw_exit(&rib_stat->hca->state_lock);
1435 
1436 	if (status == RDMA_SUCCESS) {
1437 		*handle = (void *)rpt.hca;
1438 		return (RDMA_SUCCESS);
1439 	} else {
1440 		*handle = NULL;
1441 		DTRACE_PROBE(rpcib__i__pingfailed);
1442 		return (RDMA_FAILED);
1443 	}
1444 }
1445 
1446 /* Client side qp creation */
1447 static rdma_stat
1448 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1449 {
1450 	rib_qp_t	*kqp = NULL;
1451 	CONN		*conn;
1452 	rdma_clnt_cred_ctrl_t *cc_info;
1453 
1454 	ASSERT(qp != NULL);
1455 	*qp = NULL;
1456 
1457 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1458 	conn = qptoc(kqp);
1459 	kqp->hca = hca;
1460 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1461 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1462 
1463 	kqp->mode = RIB_CLIENT;
1464 	kqp->chan_flags = IBT_BLOCKING;
1465 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1466 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1467 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1468 	/*
1469 	 * Initialize
1470 	 */
1471 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1472 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1473 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1474 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1475 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1476 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1477 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1478 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1479 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1480 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1481 	/*
1482 	 * Initialize the client credit control
1483 	 * portion of the rdmaconn struct.
1484 	 */
1485 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1486 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1487 	cc_info->clnt_cc_granted_ops = 0;
1488 	cc_info->clnt_cc_in_flight_ops = 0;
1489 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1490 
1491 	*qp = kqp;
1492 	return (RDMA_SUCCESS);
1493 }
1494 
1495 /* Server side qp creation */
1496 static rdma_stat
1497 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1498 {
1499 	rib_qp_t	*kqp = NULL;
1500 	ibt_chan_sizes_t	chan_sizes;
1501 	ibt_rc_chan_alloc_args_t	qp_attr;
1502 	ibt_status_t		ibt_status;
1503 	rdma_srv_cred_ctrl_t *cc_info;
1504 
1505 	*qp = NULL;
1506 
1507 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1508 	kqp->hca = hca;
1509 	kqp->port_num = port;
1510 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1511 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1512 
1513 	/*
1514 	 * Create the qp handle
1515 	 */
1516 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1517 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1518 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1519 	qp_attr.rc_pd = hca->pd_hdl;
1520 	qp_attr.rc_hca_port_num = port;
1521 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1522 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1523 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1524 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1525 	qp_attr.rc_clone_chan = NULL;
1526 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1527 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1528 
1529 	rw_enter(&hca->state_lock, RW_READER);
1530 	if (hca->state != HCA_DETACHED) {
1531 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1532 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1533 		    &chan_sizes);
1534 	} else {
1535 		rw_exit(&hca->state_lock);
1536 		goto fail;
1537 	}
1538 	rw_exit(&hca->state_lock);
1539 
1540 	if (ibt_status != IBT_SUCCESS) {
1541 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1542 		    int, ibt_status);
1543 		goto fail;
1544 	}
1545 
1546 	kqp->mode = RIB_SERVER;
1547 	kqp->chan_flags = IBT_BLOCKING;
1548 	kqp->q = q;	/* server ONLY */
1549 
1550 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1551 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1552 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1553 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1554 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1555 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1556 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1557 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1558 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1559 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1560 	/*
1561 	 * Set the private data area to qp to be used in callbacks
1562 	 */
1563 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1564 	kqp->rdmaconn.c_state = C_CONNECTED;
1565 
1566 	/*
1567 	 * Initialize the server credit control
1568 	 * portion of the rdmaconn struct.
1569 	 */
1570 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1571 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1572 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1573 	cc_info->srv_cc_cur_buffers_used = 0;
1574 	cc_info->srv_cc_posted = preposted_rbufs;
1575 
1576 	*qp = kqp;
1577 
1578 	return (RDMA_SUCCESS);
1579 fail:
1580 	if (kqp)
1581 		kmem_free(kqp, sizeof (rib_qp_t));
1582 
1583 	return (RDMA_FAILED);
1584 }
1585 
1586 /* ARGSUSED */
1587 ibt_cm_status_t
1588 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1589     ibt_cm_return_args_t *ret_args, void *priv_data,
1590     ibt_priv_data_len_t len)
1591 {
1592 	rpcib_state_t   *ribstat;
1593 	rib_hca_t	*hca;
1594 
1595 	ribstat = (rpcib_state_t *)clnt_hdl;
1596 	hca = (rib_hca_t *)ribstat->hca;
1597 
1598 	switch (event->cm_type) {
1599 
1600 	/* got a connection close event */
1601 	case IBT_CM_EVENT_CONN_CLOSED:
1602 	{
1603 		CONN	*conn;
1604 		rib_qp_t *qp;
1605 
1606 		/* check reason why connection was closed */
1607 		switch (event->cm_event.closed) {
1608 		case IBT_CM_CLOSED_DREP_RCVD:
1609 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1610 		case IBT_CM_CLOSED_DUP:
1611 		case IBT_CM_CLOSED_ABORT:
1612 		case IBT_CM_CLOSED_ALREADY:
1613 			/*
1614 			 * These cases indicate the local end initiated
1615 			 * the closing of the channel. Nothing to do here.
1616 			 */
1617 			break;
1618 		default:
1619 			/*
1620 			 * Reason for CONN_CLOSED event must be one of
1621 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1622 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1623 			 * the remote end is closing the channel. In these
1624 			 * cases free the channel and transition to error
1625 			 * state
1626 			 */
1627 			qp = ibt_get_chan_private(event->cm_channel);
1628 			conn = qptoc(qp);
1629 			mutex_enter(&conn->c_lock);
1630 			if (conn->c_state == C_DISCONN_PEND) {
1631 				mutex_exit(&conn->c_lock);
1632 				break;
1633 			}
1634 
1635 			conn->c_state = C_ERROR_CONN;
1636 
1637 			/*
1638 			 * Free the conn if c_ref is down to 0 already
1639 			 */
1640 			if (conn->c_ref == 0) {
1641 				/*
1642 				 * Remove from list and free conn
1643 				 */
1644 				conn->c_state = C_DISCONN_PEND;
1645 				mutex_exit(&conn->c_lock);
1646 				(void) rib_disconnect_channel(conn,
1647 				    &hca->cl_conn_list);
1648 			} else {
1649 				/*
1650 				 * conn will be freed when c_ref goes to 0.
1651 				 * Indicate to cleaning thread not to close
1652 				 * the connection, but just free the channel.
1653 				 */
1654 				conn->c_flags |= C_CLOSE_NOTNEEDED;
1655 				mutex_exit(&conn->c_lock);
1656 			}
1657 #ifdef DEBUG
1658 			if (rib_debug)
1659 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1660 				    "(CONN_CLOSED) channel disconnected");
1661 #endif
1662 			break;
1663 		}
1664 		break;
1665 	}
1666 	default:
1667 		break;
1668 	}
1669 	return (IBT_CM_ACCEPT);
1670 }
1671 
1672 /*
1673  * Connect to the server.
1674  */
1675 rdma_stat
1676 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1677 {
1678 	ibt_chan_open_args_t	chan_args;	/* channel args */
1679 	ibt_chan_sizes_t	chan_sizes;
1680 	ibt_rc_chan_alloc_args_t	qp_attr;
1681 	ibt_status_t		ibt_status;
1682 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1683 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1684 	ibt_ip_cm_info_t	ipcm_info;
1685 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1686 
1687 
1688 	(void) bzero(&chan_args, sizeof (chan_args));
1689 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1690 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1691 
1692 	ipcm_info.src_addr.family = rptp->srcip.family;
1693 	switch (ipcm_info.src_addr.family) {
1694 	case AF_INET:
1695 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1696 		break;
1697 	case AF_INET6:
1698 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1699 		break;
1700 	}
1701 
1702 	ipcm_info.dst_addr.family = rptp->srcip.family;
1703 	switch (ipcm_info.dst_addr.family) {
1704 	case AF_INET:
1705 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1706 		break;
1707 	case AF_INET6:
1708 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1709 		break;
1710 	}
1711 
1712 	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1713 
1714 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1715 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1716 
1717 	if (ibt_status != IBT_SUCCESS) {
1718 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1719 		return (-1);
1720 	}
1721 
1722 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1723 	/* Alloc a RC channel */
1724 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1725 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1726 	qp_attr.rc_pd = hca->pd_hdl;
1727 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1728 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1729 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1730 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1731 	qp_attr.rc_clone_chan = NULL;
1732 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1733 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1734 
1735 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1736 	chan_args.oc_path = &rptp->path;
1737 
1738 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1739 	chan_args.oc_cm_clnt_private = (void *)rib_stat;
1740 	chan_args.oc_rdma_ra_out = 4;
1741 	chan_args.oc_rdma_ra_in = 4;
1742 	chan_args.oc_path_retry_cnt = 2;
1743 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1744 	chan_args.oc_priv_data = cmp_ip_pvt;
1745 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1746 
1747 refresh:
1748 	rw_enter(&hca->state_lock, RW_READER);
1749 	if (hca->state != HCA_DETACHED) {
1750 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1751 		    IBT_ACHAN_NO_FLAGS,
1752 		    &qp_attr, &qp->qp_hdl,
1753 		    &chan_sizes);
1754 	} else {
1755 		rw_exit(&hca->state_lock);
1756 		return (RDMA_FAILED);
1757 	}
1758 	rw_exit(&hca->state_lock);
1759 
1760 	if (ibt_status != IBT_SUCCESS) {
1761 		DTRACE_PROBE1(rpcib__i_conntosrv,
1762 		    int, ibt_status);
1763 		return (RDMA_FAILED);
1764 	}
1765 
1766 	/* Connect to the Server */
1767 	(void) bzero(&ret_args, sizeof (ret_args));
1768 	mutex_enter(&qp->cb_lock);
1769 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1770 	    IBT_BLOCKING, &chan_args, &ret_args);
1771 	if (ibt_status != IBT_SUCCESS) {
1772 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1773 		    int, ibt_status, int, ret_args.rc_status);
1774 
1775 		(void) ibt_free_channel(qp->qp_hdl);
1776 		qp->qp_hdl = NULL;
1777 		mutex_exit(&qp->cb_lock);
1778 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1779 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1780 			/*
1781 			 * Got IBT_CM_CONN_STALE probably because of stale
1782 			 * data on the passive end of a channel that existed
1783 			 * prior to reboot. Retry establishing a channel
1784 			 * REFRESH_ATTEMPTS times, during which time the
1785 			 * stale conditions on the server might clear up.
1786 			 */
1787 			goto refresh;
1788 		}
1789 		return (RDMA_FAILED);
1790 	}
1791 	mutex_exit(&qp->cb_lock);
1792 	/*
1793 	 * Set the private data area to qp to be used in callbacks
1794 	 */
1795 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1796 	return (RDMA_SUCCESS);
1797 }
1798 
1799 rdma_stat
1800 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1801 {
1802 	uint_t			i;
1803 	ibt_status_t		ibt_status;
1804 	uint8_t			num_paths_p;
1805 	ibt_ip_path_attr_t	ipattr;
1806 	ibt_path_ip_src_t	srcip;
1807 	rpcib_ipaddrs_t		addrs4;
1808 	rpcib_ipaddrs_t		addrs6;
1809 	struct sockaddr_in	*sinp;
1810 	struct sockaddr_in6	*sin6p;
1811 	rdma_stat		retval = RDMA_SUCCESS;
1812 
1813 	ASSERT(raddr->buf != NULL);
1814 
1815 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1816 
1817 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1818 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1819 		retval = RDMA_FAILED;
1820 		goto done;
1821 	}
1822 
1823 	/* Prep the destination address */
1824 	switch (addr_type) {
1825 	case AF_INET:
1826 		sinp = (struct sockaddr_in *)raddr->buf;
1827 		rptp->dstip.family = AF_INET;
1828 		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1829 		sinp = addrs4.ri_list;
1830 
1831 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1832 		ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1833 		ipattr.ipa_ndst		= 1;
1834 		ipattr.ipa_max_paths	= 1;
1835 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1836 		for (i = 0; i < addrs4.ri_count; i++) {
1837 			num_paths_p = 0;
1838 			ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr;
1839 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1840 
1841 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1842 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1843 			    &num_paths_p, &srcip);
1844 			if (ibt_status == IBT_SUCCESS &&
1845 			    num_paths_p != 0 &&
1846 			    rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) {
1847 				rptp->hca = rib_stat->hca;
1848 				rptp->srcip.family = AF_INET;
1849 				rptp->srcip.un.ip4addr =
1850 				    srcip.ip_primary.un.ip4addr;
1851 				goto done;
1852 			}
1853 		}
1854 		retval = RDMA_FAILED;
1855 		break;
1856 
1857 	case AF_INET6:
1858 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1859 		rptp->dstip.family = AF_INET6;
1860 		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1861 		sin6p = addrs6.ri_list;
1862 
1863 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1864 		ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1865 		ipattr.ipa_ndst		= 1;
1866 		ipattr.ipa_max_paths	= 1;
1867 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1868 		for (i = 0; i < addrs6.ri_count; i++) {
1869 			num_paths_p = 0;
1870 			ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr;
1871 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1872 
1873 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1874 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1875 			    &num_paths_p, &srcip);
1876 			if (ibt_status == IBT_SUCCESS &&
1877 			    num_paths_p != 0 &&
1878 			    rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) {
1879 				rptp->hca = rib_stat->hca;
1880 				rptp->srcip.family = AF_INET6;
1881 				rptp->srcip.un.ip6addr =
1882 				    srcip.ip_primary.un.ip6addr;
1883 				goto done;
1884 			}
1885 		}
1886 		retval = RDMA_FAILED;
1887 		break;
1888 
1889 	default:
1890 		retval = RDMA_INVAL;
1891 		break;
1892 	}
1893 done:
1894 
1895 	if (addrs4.ri_size > 0)
1896 		kmem_free(addrs4.ri_list, addrs4.ri_size);
1897 	if (addrs6.ri_size > 0)
1898 		kmem_free(addrs6.ri_list, addrs6.ri_size);
1899 	return (retval);
1900 }
1901 
1902 /*
1903  * Close channel, remove from connection list and
1904  * free up resources allocated for that channel.
1905  */
1906 rdma_stat
1907 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
1908 {
1909 	rib_qp_t	*qp = ctoqp(conn);
1910 	rib_hca_t	*hca;
1911 
1912 	mutex_enter(&conn->c_lock);
1913 	if (conn->c_timeout != NULL) {
1914 		mutex_exit(&conn->c_lock);
1915 		(void) untimeout(conn->c_timeout);
1916 		mutex_enter(&conn->c_lock);
1917 	}
1918 
1919 	while (conn->c_flags & C_CLOSE_PENDING) {
1920 		cv_wait(&conn->c_cv, &conn->c_lock);
1921 	}
1922 	mutex_exit(&conn->c_lock);
1923 
1924 	/*
1925 	 * c_ref == 0 and connection is in C_DISCONN_PEND
1926 	 */
1927 	hca = qp->hca;
1928 	if (conn_list != NULL)
1929 		(void) rib_rm_conn(conn, conn_list);
1930 
1931 	/*
1932 	 * There is only one case where we get here with
1933 	 * qp_hdl = NULL, which is during connection setup on
1934 	 * the client. In such a case there are no posted
1935 	 * send/recv buffers.
1936 	 */
1937 	if (qp->qp_hdl != NULL) {
1938 		mutex_enter(&qp->posted_rbufs_lock);
1939 		while (qp->n_posted_rbufs)
1940 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
1941 		mutex_exit(&qp->posted_rbufs_lock);
1942 
1943 		mutex_enter(&qp->send_rbufs_lock);
1944 		while (qp->n_send_rbufs)
1945 			cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
1946 		mutex_exit(&qp->send_rbufs_lock);
1947 
1948 		(void) ibt_free_channel(qp->qp_hdl);
1949 		qp->qp_hdl = NULL;
1950 	}
1951 
1952 	ASSERT(qp->rdlist == NULL);
1953 
1954 	if (qp->replylist != NULL) {
1955 		(void) rib_rem_replylist(qp);
1956 	}
1957 
1958 	cv_destroy(&qp->cb_conn_cv);
1959 	cv_destroy(&qp->posted_rbufs_cv);
1960 	cv_destroy(&qp->send_rbufs_cv);
1961 	mutex_destroy(&qp->cb_lock);
1962 	mutex_destroy(&qp->replylist_lock);
1963 	mutex_destroy(&qp->posted_rbufs_lock);
1964 	mutex_destroy(&qp->send_rbufs_lock);
1965 	mutex_destroy(&qp->rdlist_lock);
1966 
1967 	cv_destroy(&conn->c_cv);
1968 	mutex_destroy(&conn->c_lock);
1969 
1970 	if (conn->c_raddr.buf != NULL) {
1971 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
1972 	}
1973 	if (conn->c_laddr.buf != NULL) {
1974 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
1975 	}
1976 
1977 	/*
1978 	 * Credit control cleanup.
1979 	 */
1980 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
1981 		rdma_clnt_cred_ctrl_t *cc_info;
1982 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1983 		cv_destroy(&cc_info->clnt_cc_cv);
1984 	}
1985 
1986 	kmem_free(qp, sizeof (rib_qp_t));
1987 
1988 	/*
1989 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
1990 	 * then the hca is no longer being used.
1991 	 */
1992 	if (conn_list != NULL) {
1993 		rw_enter(&hca->state_lock, RW_READER);
1994 		if (hca->state == HCA_DETACHED) {
1995 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
1996 			if (hca->srv_conn_list.conn_hd == NULL) {
1997 				rw_enter(&hca->cl_conn_list.conn_lock,
1998 				    RW_READER);
1999 
2000 				if (hca->cl_conn_list.conn_hd == NULL) {
2001 					mutex_enter(&hca->inuse_lock);
2002 					hca->inuse = FALSE;
2003 					cv_signal(&hca->cb_cv);
2004 					mutex_exit(&hca->inuse_lock);
2005 				}
2006 				rw_exit(&hca->cl_conn_list.conn_lock);
2007 			}
2008 			rw_exit(&hca->srv_conn_list.conn_lock);
2009 		}
2010 		rw_exit(&hca->state_lock);
2011 	}
2012 
2013 	return (RDMA_SUCCESS);
2014 }
2015 
2016 /*
2017  * All sends are done under the protection of
2018  * the wdesc->sendwait_lock. n_send_rbufs count
2019  * is protected using the send_rbufs_lock.
2020  * lock ordering is:
2021  * sendwait_lock -> send_rbufs_lock
2022  */
2023 
2024 void
2025 rib_send_hold(rib_qp_t *qp)
2026 {
2027 	mutex_enter(&qp->send_rbufs_lock);
2028 	qp->n_send_rbufs++;
2029 	mutex_exit(&qp->send_rbufs_lock);
2030 }
2031 
2032 void
2033 rib_send_rele(rib_qp_t *qp)
2034 {
2035 	mutex_enter(&qp->send_rbufs_lock);
2036 	qp->n_send_rbufs--;
2037 	if (qp->n_send_rbufs == 0)
2038 		cv_signal(&qp->send_rbufs_cv);
2039 	mutex_exit(&qp->send_rbufs_lock);
2040 }
2041 
2042 /*
2043  * Wait for send completion notification. Only on receiving a
2044  * notification be it a successful or error completion, free the
2045  * send_wid.
2046  */
2047 static rdma_stat
2048 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2049 {
2050 	clock_t timout, cv_wait_ret;
2051 	rdma_stat error = RDMA_SUCCESS;
2052 	int	i;
2053 
2054 	/*
2055 	 * Wait for send to complete
2056 	 */
2057 	ASSERT(wd != NULL);
2058 	mutex_enter(&wd->sendwait_lock);
2059 	if (wd->status == (uint_t)SEND_WAIT) {
2060 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2061 		    ddi_get_lbolt();
2062 
2063 		if (qp->mode == RIB_SERVER) {
2064 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2065 			    &wd->sendwait_lock, timout)) > 0 &&
2066 			    wd->status == (uint_t)SEND_WAIT)
2067 				;
2068 			switch (cv_wait_ret) {
2069 			case -1:	/* timeout */
2070 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2071 
2072 				wd->cv_sig = 0;		/* no signal needed */
2073 				error = RDMA_TIMEDOUT;
2074 				break;
2075 			default:	/* got send completion */
2076 				break;
2077 			}
2078 		} else {
2079 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2080 			    &wd->sendwait_lock, timout)) > 0 &&
2081 			    wd->status == (uint_t)SEND_WAIT)
2082 				;
2083 			switch (cv_wait_ret) {
2084 			case -1:	/* timeout */
2085 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2086 
2087 				wd->cv_sig = 0;		/* no signal needed */
2088 				error = RDMA_TIMEDOUT;
2089 				break;
2090 			case 0:		/* interrupted */
2091 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2092 
2093 				wd->cv_sig = 0;		/* no signal needed */
2094 				error = RDMA_INTR;
2095 				break;
2096 			default:	/* got send completion */
2097 				break;
2098 			}
2099 		}
2100 	}
2101 
2102 	if (wd->status != (uint_t)SEND_WAIT) {
2103 		/* got send completion */
2104 		if (wd->status != RDMA_SUCCESS) {
2105 			switch (wd->status) {
2106 			case RDMA_CONNLOST:
2107 				error = RDMA_CONNLOST;
2108 				break;
2109 			default:
2110 				error = RDMA_FAILED;
2111 				break;
2112 			}
2113 		}
2114 		for (i = 0; i < wd->nsbufs; i++) {
2115 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2116 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2117 		}
2118 
2119 		rib_send_rele(qp);
2120 
2121 		mutex_exit(&wd->sendwait_lock);
2122 		(void) rib_free_sendwait(wd);
2123 
2124 	} else {
2125 		mutex_exit(&wd->sendwait_lock);
2126 	}
2127 	return (error);
2128 }
2129 
2130 static struct send_wid *
2131 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2132 {
2133 	struct send_wid	*wd;
2134 
2135 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2136 	wd->xid = xid;
2137 	wd->cv_sig = cv_sig;
2138 	wd->qp = qp;
2139 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2140 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2141 	wd->status = (uint_t)SEND_WAIT;
2142 
2143 	return (wd);
2144 }
2145 
2146 static int
2147 rib_free_sendwait(struct send_wid *wdesc)
2148 {
2149 	cv_destroy(&wdesc->wait_cv);
2150 	mutex_destroy(&wdesc->sendwait_lock);
2151 	kmem_free(wdesc, sizeof (*wdesc));
2152 
2153 	return (0);
2154 }
2155 
2156 static rdma_stat
2157 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2158 {
2159 	mutex_enter(&qp->replylist_lock);
2160 	if (rep != NULL) {
2161 		(void) rib_remreply(qp, rep);
2162 		mutex_exit(&qp->replylist_lock);
2163 		return (RDMA_SUCCESS);
2164 	}
2165 	mutex_exit(&qp->replylist_lock);
2166 	return (RDMA_FAILED);
2167 }
2168 
2169 /*
2170  * Send buffers are freed here only in case of error in posting
2171  * on QP. If the post succeeded, the send buffers are freed upon
2172  * send completion in rib_sendwait() or in the scq_handler.
2173  */
2174 rdma_stat
2175 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2176 	int send_sig, int cv_sig, caddr_t *swid)
2177 {
2178 	struct send_wid	*wdesc;
2179 	struct clist	*clp;
2180 	ibt_status_t	ibt_status = IBT_SUCCESS;
2181 	rdma_stat	ret = RDMA_SUCCESS;
2182 	ibt_send_wr_t	tx_wr;
2183 	int		i, nds;
2184 	ibt_wr_ds_t	sgl[DSEG_MAX];
2185 	uint_t		total_msg_size;
2186 	rib_qp_t	*qp;
2187 
2188 	qp = ctoqp(conn);
2189 
2190 	ASSERT(cl != NULL);
2191 
2192 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2193 
2194 	nds = 0;
2195 	total_msg_size = 0;
2196 	clp = cl;
2197 	while (clp != NULL) {
2198 		if (nds >= DSEG_MAX) {
2199 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2200 			return (RDMA_FAILED);
2201 		}
2202 		sgl[nds].ds_va = clp->w.c_saddr;
2203 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2204 		sgl[nds].ds_len = clp->c_len;
2205 		total_msg_size += clp->c_len;
2206 		clp = clp->c_next;
2207 		nds++;
2208 	}
2209 
2210 	if (send_sig) {
2211 		/* Set SEND_SIGNAL flag. */
2212 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2213 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2214 		*swid = (caddr_t)wdesc;
2215 		tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2216 		mutex_enter(&wdesc->sendwait_lock);
2217 		wdesc->nsbufs = nds;
2218 		for (i = 0; i < nds; i++) {
2219 			wdesc->sbufaddr[i] = sgl[i].ds_va;
2220 		}
2221 	} else {
2222 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2223 		*swid = NULL;
2224 		tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2225 	}
2226 
2227 	tx_wr.wr_opcode = IBT_WRC_SEND;
2228 	tx_wr.wr_trans = IBT_RC_SRV;
2229 	tx_wr.wr_nds = nds;
2230 	tx_wr.wr_sgl = sgl;
2231 
2232 	mutex_enter(&conn->c_lock);
2233 	if (conn->c_state == C_CONNECTED) {
2234 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2235 	}
2236 	if (conn->c_state != C_CONNECTED ||
2237 	    ibt_status != IBT_SUCCESS) {
2238 		if (conn->c_state != C_DISCONN_PEND)
2239 			conn->c_state = C_ERROR_CONN;
2240 		mutex_exit(&conn->c_lock);
2241 		if (send_sig) {
2242 			for (i = 0; i < nds; i++) {
2243 				rib_rbuf_free(conn, SEND_BUFFER,
2244 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2245 			}
2246 			mutex_exit(&wdesc->sendwait_lock);
2247 			(void) rib_free_sendwait(wdesc);
2248 		}
2249 		return (RDMA_CONNLOST);
2250 	}
2251 
2252 	mutex_exit(&conn->c_lock);
2253 
2254 	if (send_sig) {
2255 		rib_send_hold(qp);
2256 		mutex_exit(&wdesc->sendwait_lock);
2257 		if (cv_sig) {
2258 			/*
2259 			 * cv_wait for send to complete.
2260 			 * We can fail due to a timeout or signal or
2261 			 * unsuccessful send.
2262 			 */
2263 			ret = rib_sendwait(qp, wdesc);
2264 
2265 			return (ret);
2266 		}
2267 	}
2268 
2269 	return (RDMA_SUCCESS);
2270 }
2271 
2272 
2273 rdma_stat
2274 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2275 {
2276 	rdma_stat	ret;
2277 	caddr_t		wd;
2278 
2279 	/* send-wait & cv_signal */
2280 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2281 	return (ret);
2282 }
2283 
2284 /*
2285  * Deprecated/obsolete interface not used currently
2286  * but earlier used for READ-READ protocol.
2287  * Send RPC reply and wait for RDMA_DONE.
2288  */
2289 rdma_stat
2290 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2291 {
2292 	rdma_stat ret = RDMA_SUCCESS;
2293 	struct rdma_done_list *rd;
2294 	clock_t timout, cv_wait_ret;
2295 	caddr_t *wid = NULL;
2296 	rib_qp_t *qp = ctoqp(conn);
2297 
2298 	mutex_enter(&qp->rdlist_lock);
2299 	rd = rdma_done_add(qp, msgid);
2300 
2301 	/* No cv_signal (whether send-wait or no-send-wait) */
2302 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2303 
2304 	if (ret != RDMA_SUCCESS) {
2305 		rdma_done_rm(qp, rd);
2306 	} else {
2307 		/*
2308 		 * Wait for RDMA_DONE from remote end
2309 		 */
2310 		timout =
2311 		    drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2312 		cv_wait_ret = cv_timedwait(&rd->rdma_done_cv,
2313 		    &qp->rdlist_lock,
2314 		    timout);
2315 
2316 		rdma_done_rm(qp, rd);
2317 
2318 		if (cv_wait_ret < 0) {
2319 			ret = RDMA_TIMEDOUT;
2320 		}
2321 	}
2322 
2323 	mutex_exit(&qp->rdlist_lock);
2324 	return (ret);
2325 }
2326 
2327 static struct recv_wid *
2328 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2329 {
2330 	struct recv_wid	*rwid;
2331 
2332 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2333 	rwid->xid = msgid;
2334 	rwid->addr = sgl->ds_va;
2335 	rwid->qp = qp;
2336 
2337 	return (rwid);
2338 }
2339 
2340 static void
2341 rib_free_wid(struct recv_wid *rwid)
2342 {
2343 	kmem_free(rwid, sizeof (struct recv_wid));
2344 }
2345 
2346 rdma_stat
2347 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2348 {
2349 	rib_qp_t	*qp = ctoqp(conn);
2350 	struct clist	*clp = cl;
2351 	struct reply	*rep;
2352 	struct recv_wid	*rwid;
2353 	int		nds;
2354 	ibt_wr_ds_t	sgl[DSEG_MAX];
2355 	ibt_recv_wr_t	recv_wr;
2356 	rdma_stat	ret;
2357 	ibt_status_t	ibt_status;
2358 
2359 	/*
2360 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2361 	 */
2362 
2363 	nds = 0;
2364 	while (cl != NULL) {
2365 		if (nds >= DSEG_MAX) {
2366 			ret = RDMA_FAILED;
2367 			goto done;
2368 		}
2369 		sgl[nds].ds_va = cl->w.c_saddr;
2370 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2371 		sgl[nds].ds_len = cl->c_len;
2372 		cl = cl->c_next;
2373 		nds++;
2374 	}
2375 
2376 	if (nds != 1) {
2377 		ret = RDMA_FAILED;
2378 		goto done;
2379 	}
2380 
2381 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2382 	recv_wr.wr_nds = nds;
2383 	recv_wr.wr_sgl = sgl;
2384 
2385 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2386 	if (rwid) {
2387 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2388 	} else {
2389 		ret = RDMA_NORESOURCE;
2390 		goto done;
2391 	}
2392 	rep = rib_addreplylist(qp, msgid);
2393 	if (!rep) {
2394 		rib_free_wid(rwid);
2395 		ret = RDMA_NORESOURCE;
2396 		goto done;
2397 	}
2398 
2399 	mutex_enter(&conn->c_lock);
2400 
2401 	if (conn->c_state == C_CONNECTED) {
2402 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2403 	}
2404 
2405 	if (conn->c_state != C_CONNECTED ||
2406 	    ibt_status != IBT_SUCCESS) {
2407 		if (conn->c_state != C_DISCONN_PEND)
2408 			conn->c_state = C_ERROR_CONN;
2409 		mutex_exit(&conn->c_lock);
2410 		rib_free_wid(rwid);
2411 		(void) rib_rem_rep(qp, rep);
2412 		ret = RDMA_CONNLOST;
2413 		goto done;
2414 	}
2415 	mutex_exit(&conn->c_lock);
2416 	return (RDMA_SUCCESS);
2417 
2418 done:
2419 	while (clp != NULL) {
2420 		rib_rbuf_free(conn, RECV_BUFFER,
2421 		    (void *)(uintptr_t)clp->w.c_saddr3);
2422 		clp = clp->c_next;
2423 	}
2424 	return (ret);
2425 }
2426 
2427 rdma_stat
2428 rib_svc_post(CONN* conn, struct clist *cl)
2429 {
2430 	rib_qp_t	*qp = ctoqp(conn);
2431 	struct svc_recv	*s_recvp;
2432 	int		nds;
2433 	ibt_wr_ds_t	sgl[DSEG_MAX];
2434 	ibt_recv_wr_t	recv_wr;
2435 	ibt_status_t	ibt_status;
2436 
2437 	nds = 0;
2438 	while (cl != NULL) {
2439 		if (nds >= DSEG_MAX) {
2440 			return (RDMA_FAILED);
2441 		}
2442 		sgl[nds].ds_va = cl->w.c_saddr;
2443 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2444 		sgl[nds].ds_len = cl->c_len;
2445 		cl = cl->c_next;
2446 		nds++;
2447 	}
2448 
2449 	if (nds != 1) {
2450 		rib_rbuf_free(conn, RECV_BUFFER,
2451 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2452 
2453 		return (RDMA_FAILED);
2454 	}
2455 
2456 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2457 	recv_wr.wr_nds = nds;
2458 	recv_wr.wr_sgl = sgl;
2459 
2460 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2461 	/* Use s_recvp's addr as wr id */
2462 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2463 	mutex_enter(&conn->c_lock);
2464 	if (conn->c_state == C_CONNECTED) {
2465 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2466 	}
2467 	if (conn->c_state != C_CONNECTED ||
2468 	    ibt_status != IBT_SUCCESS) {
2469 		if (conn->c_state != C_DISCONN_PEND)
2470 			conn->c_state = C_ERROR_CONN;
2471 		mutex_exit(&conn->c_lock);
2472 		rib_rbuf_free(conn, RECV_BUFFER,
2473 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2474 		(void) rib_free_svc_recv(s_recvp);
2475 
2476 		return (RDMA_CONNLOST);
2477 	}
2478 	mutex_exit(&conn->c_lock);
2479 
2480 	return (RDMA_SUCCESS);
2481 }
2482 
2483 /* Client */
2484 rdma_stat
2485 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2486 {
2487 
2488 	return (rib_clnt_post(conn, cl, msgid));
2489 }
2490 
2491 /* Client */
2492 rdma_stat
2493 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2494 {
2495 	rib_qp_t	*qp = ctoqp(conn);
2496 	struct reply	*rep;
2497 
2498 	mutex_enter(&qp->replylist_lock);
2499 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2500 		if (rep->xid == msgid) {
2501 			if (rep->vaddr_cq) {
2502 				rib_rbuf_free(conn, RECV_BUFFER,
2503 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2504 			}
2505 			(void) rib_remreply(qp, rep);
2506 			break;
2507 		}
2508 	}
2509 	mutex_exit(&qp->replylist_lock);
2510 
2511 	return (RDMA_SUCCESS);
2512 }
2513 
2514 /* Server */
2515 rdma_stat
2516 rib_post_recv(CONN *conn, struct clist *cl)
2517 {
2518 	rib_qp_t	*qp = ctoqp(conn);
2519 
2520 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2521 		mutex_enter(&qp->posted_rbufs_lock);
2522 		qp->n_posted_rbufs++;
2523 		mutex_exit(&qp->posted_rbufs_lock);
2524 		return (RDMA_SUCCESS);
2525 	}
2526 	return (RDMA_FAILED);
2527 }
2528 
2529 /*
2530  * Client side only interface to "recv" the rpc reply buf
2531  * posted earlier by rib_post_resp(conn, cl, msgid).
2532  */
2533 rdma_stat
2534 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2535 {
2536 	struct reply *rep = NULL;
2537 	clock_t timout, cv_wait_ret;
2538 	rdma_stat ret = RDMA_SUCCESS;
2539 	rib_qp_t *qp = ctoqp(conn);
2540 
2541 	/*
2542 	 * Find the reply structure for this msgid
2543 	 */
2544 	mutex_enter(&qp->replylist_lock);
2545 
2546 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2547 		if (rep->xid == msgid)
2548 			break;
2549 	}
2550 
2551 	if (rep != NULL) {
2552 		/*
2553 		 * If message not yet received, wait.
2554 		 */
2555 		if (rep->status == (uint_t)REPLY_WAIT) {
2556 			timout = ddi_get_lbolt() +
2557 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2558 
2559 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2560 			    &qp->replylist_lock, timout)) > 0 &&
2561 			    rep->status == (uint_t)REPLY_WAIT)
2562 				;
2563 
2564 			switch (cv_wait_ret) {
2565 			case -1:	/* timeout */
2566 				ret = RDMA_TIMEDOUT;
2567 				break;
2568 			case 0:
2569 				ret = RDMA_INTR;
2570 				break;
2571 			default:
2572 				break;
2573 			}
2574 		}
2575 
2576 		if (rep->status == RDMA_SUCCESS) {
2577 			struct clist *cl = NULL;
2578 
2579 			/*
2580 			 * Got message successfully
2581 			 */
2582 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2583 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2584 			*clp = cl;
2585 		} else {
2586 			if (rep->status != (uint_t)REPLY_WAIT) {
2587 				/*
2588 				 * Got error in reply message. Free
2589 				 * recv buffer here.
2590 				 */
2591 				ret = rep->status;
2592 				rib_rbuf_free(conn, RECV_BUFFER,
2593 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2594 			}
2595 		}
2596 		(void) rib_remreply(qp, rep);
2597 	} else {
2598 		/*
2599 		 * No matching reply structure found for given msgid on the
2600 		 * reply wait list.
2601 		 */
2602 		ret = RDMA_INVAL;
2603 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2604 	}
2605 
2606 	/*
2607 	 * Done.
2608 	 */
2609 	mutex_exit(&qp->replylist_lock);
2610 	return (ret);
2611 }
2612 
2613 /*
2614  * RDMA write a buffer to the remote address.
2615  */
2616 rdma_stat
2617 rib_write(CONN *conn, struct clist *cl, int wait)
2618 {
2619 	ibt_send_wr_t	tx_wr;
2620 	int		cv_sig;
2621 	ibt_wr_ds_t	sgl[DSEG_MAX];
2622 	struct send_wid	*wdesc;
2623 	ibt_status_t	ibt_status;
2624 	rdma_stat	ret = RDMA_SUCCESS;
2625 	rib_qp_t	*qp = ctoqp(conn);
2626 	uint64_t	n_writes = 0;
2627 
2628 	if (cl == NULL) {
2629 		return (RDMA_FAILED);
2630 	}
2631 
2632 	while ((cl != NULL)) {
2633 		if (cl->c_len > 0) {
2634 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2635 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2636 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2637 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2638 			sgl[0].ds_va = cl->w.c_saddr;
2639 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2640 			sgl[0].ds_len = cl->c_len;
2641 
2642 			if (wait) {
2643 				cv_sig = 1;
2644 			} else {
2645 				if (n_writes > max_unsignaled_rws) {
2646 					n_writes = 0;
2647 					cv_sig = 1;
2648 				} else {
2649 					cv_sig = 0;
2650 				}
2651 			}
2652 
2653 			if (cv_sig) {
2654 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2655 				wdesc = rib_init_sendwait(0, cv_sig, qp);
2656 				tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2657 				mutex_enter(&wdesc->sendwait_lock);
2658 			} else {
2659 				tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2660 				tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2661 			}
2662 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2663 			tx_wr.wr_trans = IBT_RC_SRV;
2664 			tx_wr.wr_nds = 1;
2665 			tx_wr.wr_sgl = sgl;
2666 
2667 			mutex_enter(&conn->c_lock);
2668 			if (conn->c_state == C_CONNECTED) {
2669 				ibt_status =
2670 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2671 			}
2672 			if (conn->c_state != C_CONNECTED ||
2673 			    ibt_status != IBT_SUCCESS) {
2674 				if (conn->c_state != C_DISCONN_PEND)
2675 					conn->c_state = C_ERROR_CONN;
2676 				mutex_exit(&conn->c_lock);
2677 				if (cv_sig) {
2678 					mutex_exit(&wdesc->sendwait_lock);
2679 					(void) rib_free_sendwait(wdesc);
2680 				}
2681 				return (RDMA_CONNLOST);
2682 			}
2683 
2684 			mutex_exit(&conn->c_lock);
2685 
2686 			/*
2687 			 * Wait for send to complete
2688 			 */
2689 			if (cv_sig) {
2690 
2691 				rib_send_hold(qp);
2692 				mutex_exit(&wdesc->sendwait_lock);
2693 
2694 				ret = rib_sendwait(qp, wdesc);
2695 				if (ret != 0)
2696 					return (ret);
2697 			}
2698 			n_writes ++;
2699 		}
2700 		cl = cl->c_next;
2701 	}
2702 	return (RDMA_SUCCESS);
2703 }
2704 
2705 /*
2706  * RDMA Read a buffer from the remote address.
2707  */
2708 rdma_stat
2709 rib_read(CONN *conn, struct clist *cl, int wait)
2710 {
2711 	ibt_send_wr_t	rx_wr;
2712 	int		cv_sig = 0;
2713 	ibt_wr_ds_t	sgl;
2714 	struct send_wid	*wdesc;
2715 	ibt_status_t	ibt_status = IBT_SUCCESS;
2716 	rdma_stat	ret = RDMA_SUCCESS;
2717 	rib_qp_t	*qp = ctoqp(conn);
2718 
2719 	if (cl == NULL) {
2720 		return (RDMA_FAILED);
2721 	}
2722 
2723 	while (cl != NULL) {
2724 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2725 		/*
2726 		 * Remote address is at the head chunk item in list.
2727 		 */
2728 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2729 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2730 
2731 		sgl.ds_va = cl->u.c_daddr;
2732 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2733 		sgl.ds_len = cl->c_len;
2734 
2735 		/*
2736 		 * If there are multiple chunks to be read, and
2737 		 * wait is set, ask for signal only for the last chunk
2738 		 * and wait only on the last chunk. The completion of
2739 		 * RDMA_READ on last chunk ensures that reads on all
2740 		 * previous chunks are also completed.
2741 		 */
2742 		if (wait && (cl->c_next == NULL)) {
2743 			cv_sig = 1;
2744 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2745 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2746 			rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2747 			mutex_enter(&wdesc->sendwait_lock);
2748 		} else {
2749 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2750 			rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2751 		}
2752 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2753 		rx_wr.wr_trans = IBT_RC_SRV;
2754 		rx_wr.wr_nds = 1;
2755 		rx_wr.wr_sgl = &sgl;
2756 
2757 		mutex_enter(&conn->c_lock);
2758 		if (conn->c_state == C_CONNECTED) {
2759 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2760 		}
2761 		if (conn->c_state != C_CONNECTED ||
2762 		    ibt_status != IBT_SUCCESS) {
2763 			if (conn->c_state != C_DISCONN_PEND)
2764 				conn->c_state = C_ERROR_CONN;
2765 			mutex_exit(&conn->c_lock);
2766 			if (wait && (cl->c_next == NULL)) {
2767 				mutex_exit(&wdesc->sendwait_lock);
2768 				(void) rib_free_sendwait(wdesc);
2769 			}
2770 			return (RDMA_CONNLOST);
2771 		}
2772 
2773 		mutex_exit(&conn->c_lock);
2774 
2775 		/*
2776 		 * Wait for send to complete if this is the
2777 		 * last item in the list.
2778 		 */
2779 		if (wait && cl->c_next == NULL) {
2780 			rib_send_hold(qp);
2781 			mutex_exit(&wdesc->sendwait_lock);
2782 
2783 			ret = rib_sendwait(qp, wdesc);
2784 
2785 			if (ret != 0)
2786 				return (ret);
2787 		}
2788 		cl = cl->c_next;
2789 	}
2790 	return (RDMA_SUCCESS);
2791 }
2792 
2793 /*
2794  * rib_srv_cm_handler()
2795  *    Connection Manager callback to handle RC connection requests.
2796  */
2797 /* ARGSUSED */
2798 static ibt_cm_status_t
2799 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2800 	ibt_cm_return_args_t *ret_args, void *priv_data,
2801 	ibt_priv_data_len_t len)
2802 {
2803 	queue_t		*q;
2804 	rib_qp_t	*qp;
2805 	rpcib_state_t	*ribstat;
2806 	rib_hca_t	*hca;
2807 	rdma_stat	status = RDMA_SUCCESS;
2808 	int		i;
2809 	struct clist	cl;
2810 	rdma_buf_t	rdbuf = {0};
2811 	void		*buf = NULL;
2812 	CONN		*conn;
2813 	ibt_ip_cm_info_t	ipinfo;
2814 	struct sockaddr_in *s;
2815 	struct sockaddr_in6 *s6;
2816 	int sin_size = sizeof (struct sockaddr_in);
2817 	int in_size = sizeof (struct in_addr);
2818 	int sin6_size = sizeof (struct sockaddr_in6);
2819 
2820 	ASSERT(any != NULL);
2821 	ASSERT(event != NULL);
2822 
2823 	ribstat = (rpcib_state_t *)any;
2824 	hca = (rib_hca_t *)ribstat->hca;
2825 	ASSERT(hca != NULL);
2826 
2827 	/* got a connection request */
2828 	switch (event->cm_type) {
2829 	case IBT_CM_EVENT_REQ_RCV:
2830 		/*
2831 		 * If the plugin is in the NO_ACCEPT state, bail out.
2832 		 */
2833 		mutex_enter(&plugin_state_lock);
2834 		if (plugin_state == NO_ACCEPT) {
2835 			mutex_exit(&plugin_state_lock);
2836 			return (IBT_CM_REJECT);
2837 		}
2838 		mutex_exit(&plugin_state_lock);
2839 
2840 		/*
2841 		 * Need to send a MRA MAD to CM so that it does not
2842 		 * timeout on us.
2843 		 */
2844 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2845 		    event->cm_event.req.req_timeout * 8, NULL, 0);
2846 
2847 		mutex_enter(&rib_stat->open_hca_lock);
2848 		q = rib_stat->q;
2849 		mutex_exit(&rib_stat->open_hca_lock);
2850 
2851 		status = rib_svc_create_chan(hca, (caddr_t)q,
2852 		    event->cm_event.req.req_prim_hca_port, &qp);
2853 
2854 		if (status) {
2855 			return (IBT_CM_REJECT);
2856 		}
2857 
2858 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2859 		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2860 		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2861 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2862 
2863 		/*
2864 		 * Pre-posts RECV buffers
2865 		 */
2866 		conn = qptoc(qp);
2867 		for (i = 0; i < preposted_rbufs; i++) {
2868 			bzero(&rdbuf, sizeof (rdbuf));
2869 			rdbuf.type = RECV_BUFFER;
2870 			buf = rib_rbuf_alloc(conn, &rdbuf);
2871 			if (buf == NULL) {
2872 				/*
2873 				 * A connection is not established yet.
2874 				 * Just flush the channel. Buffers
2875 				 * posted till now will error out with
2876 				 * IBT_WC_WR_FLUSHED_ERR.
2877 				 */
2878 				(void) ibt_flush_channel(qp->qp_hdl);
2879 				(void) rib_disconnect_channel(conn, NULL);
2880 				return (IBT_CM_REJECT);
2881 			}
2882 
2883 			bzero(&cl, sizeof (cl));
2884 			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
2885 			cl.c_len = rdbuf.len;
2886 			cl.c_smemhandle.mrc_lmr =
2887 			    rdbuf.handle.mrc_lmr; /* lkey */
2888 			cl.c_next = NULL;
2889 			status = rib_post_recv(conn, &cl);
2890 			if (status != RDMA_SUCCESS) {
2891 				/*
2892 				 * A connection is not established yet.
2893 				 * Just flush the channel. Buffers
2894 				 * posted till now will error out with
2895 				 * IBT_WC_WR_FLUSHED_ERR.
2896 				 */
2897 				(void) ibt_flush_channel(qp->qp_hdl);
2898 				(void) rib_disconnect_channel(conn, NULL);
2899 				return (IBT_CM_REJECT);
2900 			}
2901 		}
2902 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
2903 
2904 		/*
2905 		 * Get the address translation
2906 		 */
2907 		rw_enter(&hca->state_lock, RW_READER);
2908 		if (hca->state == HCA_DETACHED) {
2909 			rw_exit(&hca->state_lock);
2910 			return (IBT_CM_REJECT);
2911 		}
2912 		rw_exit(&hca->state_lock);
2913 
2914 		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
2915 
2916 		if (ibt_get_ip_data(event->cm_priv_data_len,
2917 		    event->cm_priv_data,
2918 		    &ipinfo) != IBT_SUCCESS) {
2919 
2920 			return (IBT_CM_REJECT);
2921 		}
2922 
2923 		switch (ipinfo.src_addr.family) {
2924 		case AF_INET:
2925 
2926 			conn->c_raddr.maxlen =
2927 			    conn->c_raddr.len = sin_size;
2928 			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
2929 
2930 			s = (struct sockaddr_in *)conn->c_raddr.buf;
2931 			s->sin_family = AF_INET;
2932 
2933 			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
2934 			    &s->sin_addr, in_size);
2935 
2936 			break;
2937 
2938 		case AF_INET6:
2939 
2940 			conn->c_raddr.maxlen =
2941 			    conn->c_raddr.len = sin6_size;
2942 			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
2943 
2944 			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
2945 			s6->sin6_family = AF_INET6;
2946 			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
2947 			    &s6->sin6_addr,
2948 			    sizeof (struct in6_addr));
2949 
2950 			break;
2951 
2952 		default:
2953 			return (IBT_CM_REJECT);
2954 		}
2955 
2956 		break;
2957 
2958 	case IBT_CM_EVENT_CONN_CLOSED:
2959 	{
2960 		CONN		*conn;
2961 		rib_qp_t	*qp;
2962 
2963 		switch (event->cm_event.closed) {
2964 		case IBT_CM_CLOSED_DREP_RCVD:
2965 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
2966 		case IBT_CM_CLOSED_DUP:
2967 		case IBT_CM_CLOSED_ABORT:
2968 		case IBT_CM_CLOSED_ALREADY:
2969 			/*
2970 			 * These cases indicate the local end initiated
2971 			 * the closing of the channel. Nothing to do here.
2972 			 */
2973 			break;
2974 		default:
2975 			/*
2976 			 * Reason for CONN_CLOSED event must be one of
2977 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
2978 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
2979 			 * the remote end is closing the channel. In these
2980 			 * cases free the channel and transition to error
2981 			 * state
2982 			 */
2983 			qp = ibt_get_chan_private(event->cm_channel);
2984 			conn = qptoc(qp);
2985 			mutex_enter(&conn->c_lock);
2986 			if (conn->c_state == C_DISCONN_PEND) {
2987 				mutex_exit(&conn->c_lock);
2988 				break;
2989 			}
2990 			conn->c_state = C_ERROR_CONN;
2991 
2992 			/*
2993 			 * Free the conn if c_ref goes down to 0
2994 			 */
2995 			if (conn->c_ref == 0) {
2996 				/*
2997 				 * Remove from list and free conn
2998 				 */
2999 				conn->c_state = C_DISCONN_PEND;
3000 				mutex_exit(&conn->c_lock);
3001 				(void) rib_disconnect_channel(conn,
3002 				    &hca->srv_conn_list);
3003 			} else {
3004 				/*
3005 				 * conn will be freed when c_ref goes to 0.
3006 				 * Indicate to cleaning thread not to close
3007 				 * the connection, but just free the channel.
3008 				 */
3009 				conn->c_flags |= C_CLOSE_NOTNEEDED;
3010 				mutex_exit(&conn->c_lock);
3011 			}
3012 			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3013 			break;
3014 		}
3015 		break;
3016 	}
3017 	case IBT_CM_EVENT_CONN_EST:
3018 		/*
3019 		 * RTU received, hence connection established.
3020 		 */
3021 		if (rib_debug > 1)
3022 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3023 			    "(CONN_EST) channel established");
3024 		break;
3025 
3026 	default:
3027 		if (rib_debug > 2) {
3028 			/* Let CM handle the following events. */
3029 			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3030 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3031 				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3032 			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3033 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3034 				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3035 			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3036 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3037 				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3038 			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3039 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3040 				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3041 			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3042 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3043 				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
3044 			}
3045 		}
3046 		return (IBT_CM_DEFAULT);
3047 	}
3048 
3049 	/* accept all other CM messages (i.e. let the CM handle them) */
3050 	return (IBT_CM_ACCEPT);
3051 }
3052 
3053 static rdma_stat
3054 rib_register_service(rib_hca_t *hca, int service_type)
3055 {
3056 	ibt_srv_desc_t		sdesc;
3057 	ibt_hca_portinfo_t	*port_infop;
3058 	ib_svc_id_t		srv_id;
3059 	ibt_srv_hdl_t		srv_hdl;
3060 	uint_t			port_size;
3061 	uint_t			pki, i, num_ports, nbinds;
3062 	ibt_status_t		ibt_status;
3063 	rib_service_t		*new_service;
3064 	ib_pkey_t		pkey;
3065 
3066 	/*
3067 	 * Query all ports for the given HCA
3068 	 */
3069 	rw_enter(&hca->state_lock, RW_READER);
3070 	if (hca->state != HCA_DETACHED) {
3071 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3072 		    &num_ports, &port_size);
3073 		rw_exit(&hca->state_lock);
3074 	} else {
3075 		rw_exit(&hca->state_lock);
3076 		return (RDMA_FAILED);
3077 	}
3078 	if (ibt_status != IBT_SUCCESS) {
3079 		return (RDMA_FAILED);
3080 	}
3081 
3082 	DTRACE_PROBE1(rpcib__i__regservice_numports,
3083 	    int, num_ports);
3084 
3085 	for (i = 0; i < num_ports; i++) {
3086 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3087 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3088 			    int, i+1);
3089 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3090 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
3091 			    int, i+1);
3092 		}
3093 	}
3094 
3095 	/*
3096 	 * Get all the IP addresses on this system to register the
3097 	 * given "service type" on all DNS recognized IP addrs.
3098 	 * Each service type such as NFS will have all the systems
3099 	 * IP addresses as its different names. For now the only
3100 	 * type of service we support in RPCIB is NFS.
3101 	 */
3102 	rw_enter(&hca->service_list_lock, RW_WRITER);
3103 	/*
3104 	 * Start registering and binding service to active
3105 	 * on active ports on this HCA.
3106 	 */
3107 	nbinds = 0;
3108 	new_service = NULL;
3109 
3110 	/*
3111 	 * We use IP addresses as the service names for
3112 	 * service registration.  Register each of them
3113 	 * with CM to obtain a svc_id and svc_hdl.  We do not
3114 	 * register the service with machine's loopback address.
3115 	 */
3116 	(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3117 	(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3118 	(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3119 
3120 	sdesc.sd_handler = rib_srv_cm_handler;
3121 	sdesc.sd_flags = 0;
3122 	ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3123 	    &sdesc, ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port),
3124 	    1, &srv_hdl, &srv_id);
3125 
3126 	for (i = 0; i < num_ports; i++) {
3127 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3128 			continue;
3129 
3130 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3131 			pkey = port_infop[i].p_pkey_tbl[pki];
3132 			if ((pkey & IBSRM_HB) &&
3133 			    (pkey != IB_PKEY_INVALID_FULL)) {
3134 
3135 				/*
3136 				 * Allocate and prepare a service entry
3137 				 */
3138 				new_service =
3139 				    kmem_zalloc(1 * sizeof (rib_service_t),
3140 				    KM_SLEEP);
3141 
3142 				new_service->srv_type = service_type;
3143 				new_service->srv_hdl = srv_hdl;
3144 				new_service->srv_next = NULL;
3145 
3146 				ibt_status = ibt_bind_service(srv_hdl,
3147 				    port_infop[i].p_sgid_tbl[0],
3148 				    NULL, rib_stat, NULL);
3149 
3150 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3151 				    int, ibt_status);
3152 
3153 				if (ibt_status != IBT_SUCCESS) {
3154 					kmem_free(new_service,
3155 					    sizeof (rib_service_t));
3156 					new_service = NULL;
3157 					continue;
3158 				}
3159 
3160 				/*
3161 				 * Add to the service list for this HCA
3162 				 */
3163 				new_service->srv_next = hca->service_list;
3164 				hca->service_list = new_service;
3165 				new_service = NULL;
3166 				nbinds++;
3167 			}
3168 		}
3169 	}
3170 	rw_exit(&hca->service_list_lock);
3171 
3172 	ibt_free_portinfo(port_infop, port_size);
3173 
3174 	if (nbinds == 0) {
3175 		return (RDMA_FAILED);
3176 	} else {
3177 		/*
3178 		 * Put this plugin into accept state, since atleast
3179 		 * one registration was successful.
3180 		 */
3181 		mutex_enter(&plugin_state_lock);
3182 		plugin_state = ACCEPT;
3183 		mutex_exit(&plugin_state_lock);
3184 		return (RDMA_SUCCESS);
3185 	}
3186 }
3187 
3188 void
3189 rib_listen(struct rdma_svc_data *rd)
3190 {
3191 	rdma_stat status = RDMA_SUCCESS;
3192 
3193 	rd->active = 0;
3194 	rd->err_code = RDMA_FAILED;
3195 
3196 	/*
3197 	 * First check if a hca is still attached
3198 	 */
3199 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3200 	if (rib_stat->hca->state != HCA_INITED) {
3201 		rw_exit(&rib_stat->hca->state_lock);
3202 		return;
3203 	}
3204 	rw_exit(&rib_stat->hca->state_lock);
3205 
3206 	rib_stat->q = &rd->q;
3207 	/*
3208 	 * Right now the only service type is NFS. Hence force feed this
3209 	 * value. Ideally to communicate the service type it should be
3210 	 * passed down in rdma_svc_data.
3211 	 */
3212 	rib_stat->service_type = NFS;
3213 	status = rib_register_service(rib_stat->hca, NFS);
3214 	if (status != RDMA_SUCCESS) {
3215 		rd->err_code = status;
3216 		return;
3217 	}
3218 	/*
3219 	 * Service active on an HCA, check rd->err_code for more
3220 	 * explainable errors.
3221 	 */
3222 	rd->active = 1;
3223 	rd->err_code = status;
3224 }
3225 
3226 /* XXXX */
3227 /* ARGSUSED */
3228 static void
3229 rib_listen_stop(struct rdma_svc_data *svcdata)
3230 {
3231 	rib_hca_t		*hca;
3232 
3233 	/*
3234 	 * KRPC called the RDMATF to stop the listeners, this means
3235 	 * stop sending incomming or recieved requests to KRPC master
3236 	 * transport handle for RDMA-IB. This is also means that the
3237 	 * master transport handle, responsible for us, is going away.
3238 	 */
3239 	mutex_enter(&plugin_state_lock);
3240 	plugin_state = NO_ACCEPT;
3241 	if (svcdata != NULL)
3242 		svcdata->active = 0;
3243 	mutex_exit(&plugin_state_lock);
3244 
3245 	/*
3246 	 * First check if a hca is still attached
3247 	 */
3248 	hca = rib_stat->hca;
3249 	rw_enter(&hca->state_lock, RW_READER);
3250 	if (hca->state != HCA_INITED) {
3251 		rw_exit(&hca->state_lock);
3252 		return;
3253 	}
3254 	rib_close_channels(&hca->srv_conn_list);
3255 	rib_stop_services(hca);
3256 	rw_exit(&hca->state_lock);
3257 }
3258 
3259 /*
3260  * Traverse the HCA's service list to unbind and deregister services.
3261  * Instead of unbinding the service for a service handle by
3262  * calling ibt_unbind_service() for each port/pkey, we unbind
3263  * all the services for the service handle by making only one
3264  * call to ibt_unbind_all_services().  Then, we deregister the
3265  * service for the service handle.
3266  *
3267  * When traversing the entries in service_list, we compare the
3268  * srv_hdl of the current entry with that of the next.  If they
3269  * are different or if the next entry is NULL, the current entry
3270  * marks the last binding of the service handle.  In this case,
3271  * call ibt_unbind_all_services() and deregister the service for
3272  * the service handle.  If they are the same, the current and the
3273  * next entries are bound to the same service handle.  In this
3274  * case, move on to the next entry.
3275  */
3276 static void
3277 rib_stop_services(rib_hca_t *hca)
3278 {
3279 	rib_service_t		*srv_list, *to_remove;
3280 
3281 	/*
3282 	 * unbind and deregister the services for this service type.
3283 	 * Right now there is only one service type. In future it will
3284 	 * be passed down to this function.
3285 	 */
3286 	rw_enter(&hca->service_list_lock, RW_WRITER);
3287 	srv_list = hca->service_list;
3288 	while (srv_list != NULL) {
3289 		to_remove = srv_list;
3290 		srv_list = to_remove->srv_next;
3291 		if (srv_list == NULL || bcmp(to_remove->srv_hdl,
3292 		    srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
3293 
3294 			(void) ibt_unbind_all_services(to_remove->srv_hdl);
3295 			(void) ibt_deregister_service(hca->ibt_clnt_hdl,
3296 			    to_remove->srv_hdl);
3297 		}
3298 
3299 		kmem_free(to_remove, sizeof (rib_service_t));
3300 	}
3301 	hca->service_list = NULL;
3302 	rw_exit(&hca->service_list_lock);
3303 }
3304 
3305 static struct svc_recv *
3306 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3307 {
3308 	struct svc_recv	*recvp;
3309 
3310 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3311 	recvp->vaddr = sgl->ds_va;
3312 	recvp->qp = qp;
3313 	recvp->bytes_xfer = 0;
3314 	return (recvp);
3315 }
3316 
3317 static int
3318 rib_free_svc_recv(struct svc_recv *recvp)
3319 {
3320 	kmem_free(recvp, sizeof (*recvp));
3321 
3322 	return (0);
3323 }
3324 
3325 static struct reply *
3326 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3327 {
3328 	struct reply	*rep;
3329 
3330 
3331 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3332 	if (rep == NULL) {
3333 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3334 		return (NULL);
3335 	}
3336 	rep->xid = msgid;
3337 	rep->vaddr_cq = NULL;
3338 	rep->bytes_xfer = 0;
3339 	rep->status = (uint_t)REPLY_WAIT;
3340 	rep->prev = NULL;
3341 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3342 
3343 	mutex_enter(&qp->replylist_lock);
3344 	if (qp->replylist) {
3345 		rep->next = qp->replylist;
3346 		qp->replylist->prev = rep;
3347 	}
3348 	qp->rep_list_size++;
3349 
3350 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3351 	    int, qp->rep_list_size);
3352 
3353 	qp->replylist = rep;
3354 	mutex_exit(&qp->replylist_lock);
3355 
3356 	return (rep);
3357 }
3358 
3359 static rdma_stat
3360 rib_rem_replylist(rib_qp_t *qp)
3361 {
3362 	struct reply	*r, *n;
3363 
3364 	mutex_enter(&qp->replylist_lock);
3365 	for (r = qp->replylist; r != NULL; r = n) {
3366 		n = r->next;
3367 		(void) rib_remreply(qp, r);
3368 	}
3369 	mutex_exit(&qp->replylist_lock);
3370 
3371 	return (RDMA_SUCCESS);
3372 }
3373 
3374 static int
3375 rib_remreply(rib_qp_t *qp, struct reply *rep)
3376 {
3377 
3378 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3379 	if (rep->prev) {
3380 		rep->prev->next = rep->next;
3381 	}
3382 	if (rep->next) {
3383 		rep->next->prev = rep->prev;
3384 	}
3385 	if (qp->replylist == rep)
3386 		qp->replylist = rep->next;
3387 
3388 	cv_destroy(&rep->wait_cv);
3389 	qp->rep_list_size--;
3390 
3391 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3392 	    int, qp->rep_list_size);
3393 
3394 	kmem_free(rep, sizeof (*rep));
3395 
3396 	return (0);
3397 }
3398 
3399 rdma_stat
3400 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3401 	struct mrc *buf_handle)
3402 {
3403 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3404 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3405 	rdma_stat	status;
3406 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3407 
3408 	/*
3409 	 * Note: ALL buffer pools use the same memory type RDMARW.
3410 	 */
3411 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3412 	if (status == RDMA_SUCCESS) {
3413 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3414 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3415 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3416 	} else {
3417 		buf_handle->mrc_linfo = NULL;
3418 		buf_handle->mrc_lmr = 0;
3419 		buf_handle->mrc_rmr = 0;
3420 	}
3421 	return (status);
3422 }
3423 
3424 static rdma_stat
3425 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3426 	ibt_mr_flags_t spec,
3427 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3428 {
3429 	ibt_mr_attr_t	mem_attr;
3430 	ibt_status_t	ibt_status;
3431 	mem_attr.mr_vaddr = (uintptr_t)buf;
3432 	mem_attr.mr_len = (ib_msglen_t)size;
3433 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3434 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3435 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3436 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3437 
3438 	rw_enter(&hca->state_lock, RW_READER);
3439 	if (hca->state == HCA_INITED) {
3440 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3441 		    &mem_attr, mr_hdlp, mr_descp);
3442 		rw_exit(&hca->state_lock);
3443 	} else {
3444 		rw_exit(&hca->state_lock);
3445 		return (RDMA_FAILED);
3446 	}
3447 
3448 	if (ibt_status != IBT_SUCCESS) {
3449 		return (RDMA_FAILED);
3450 	}
3451 	return (RDMA_SUCCESS);
3452 }
3453 
3454 rdma_stat
3455 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3456 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3457 {
3458 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3459 	rib_lrc_entry_t *l;
3460 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3461 	rdma_stat	status;
3462 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3463 
3464 	/*
3465 	 * Non-coherent memory registration.
3466 	 */
3467 	l = (rib_lrc_entry_t *)lrc;
3468 	if (l) {
3469 		if (l->registered) {
3470 			buf_handle->mrc_linfo =
3471 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3472 			buf_handle->mrc_lmr =
3473 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3474 			buf_handle->mrc_rmr =
3475 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3476 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3477 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3478 			return (RDMA_SUCCESS);
3479 		} else {
3480 			/* Always register the whole buffer */
3481 			buf = (caddr_t)l->lrc_buf;
3482 			buflen = l->lrc_len;
3483 		}
3484 	}
3485 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3486 
3487 	if (status == RDMA_SUCCESS) {
3488 		if (l) {
3489 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3490 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3491 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3492 			l->registered		 = TRUE;
3493 		}
3494 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3495 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3496 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3497 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3498 	} else {
3499 		buf_handle->mrc_linfo = NULL;
3500 		buf_handle->mrc_lmr = 0;
3501 		buf_handle->mrc_rmr = 0;
3502 	}
3503 	return (status);
3504 }
3505 
3506 /* ARGSUSED */
3507 rdma_stat
3508 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3509 {
3510 	rib_hca_t *hca = (ctoqp(conn))->hca;
3511 	/*
3512 	 * Allow memory deregistration even if HCA is
3513 	 * getting detached. Need all outstanding
3514 	 * memory registrations to be deregistered
3515 	 * before HCA_DETACH_EVENT can be accepted.
3516 	 */
3517 	(void) ibt_deregister_mr(hca->hca_hdl,
3518 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3519 	return (RDMA_SUCCESS);
3520 }
3521 
3522 /* ARGSUSED */
3523 rdma_stat
3524 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3525 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3526 {
3527 	rib_lrc_entry_t *l;
3528 	l = (rib_lrc_entry_t *)lrc;
3529 	if (l)
3530 		if (l->registered)
3531 			return (RDMA_SUCCESS);
3532 
3533 	(void) rib_deregistermem(conn, buf, buf_handle);
3534 
3535 	return (RDMA_SUCCESS);
3536 }
3537 
3538 /* ARGSUSED */
3539 rdma_stat
3540 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3541 		int len, int cpu)
3542 {
3543 	ibt_status_t	status;
3544 	rib_hca_t *hca = (ctoqp(conn))->hca;
3545 	ibt_mr_sync_t	mr_segment;
3546 
3547 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3548 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3549 	mr_segment.ms_len = (ib_memlen_t)len;
3550 	if (cpu) {
3551 		/* make incoming data visible to memory */
3552 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3553 	} else {
3554 		/* make memory changes visible to IO */
3555 		mr_segment.ms_flags = IBT_SYNC_READ;
3556 	}
3557 	rw_enter(&hca->state_lock, RW_READER);
3558 	if (hca->state == HCA_INITED) {
3559 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3560 		rw_exit(&hca->state_lock);
3561 	} else {
3562 		rw_exit(&hca->state_lock);
3563 		return (RDMA_FAILED);
3564 	}
3565 
3566 	if (status == IBT_SUCCESS)
3567 		return (RDMA_SUCCESS);
3568 	else {
3569 		return (RDMA_FAILED);
3570 	}
3571 }
3572 
3573 /*
3574  * XXXX	????
3575  */
3576 static rdma_stat
3577 rib_getinfo(rdma_info_t *info)
3578 {
3579 	/*
3580 	 * XXXX	Hack!
3581 	 */
3582 	info->addrlen = 16;
3583 	info->mts = 1000000;
3584 	info->mtu = 1000000;
3585 
3586 	return (RDMA_SUCCESS);
3587 }
3588 
3589 rib_bufpool_t *
3590 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3591 {
3592 	rib_bufpool_t	*rbp = NULL;
3593 	bufpool_t	*bp = NULL;
3594 	caddr_t		buf;
3595 	ibt_mr_attr_t	mem_attr;
3596 	ibt_status_t	ibt_status;
3597 	int		i, j;
3598 
3599 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3600 
3601 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3602 	    num * sizeof (void *), KM_SLEEP);
3603 
3604 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3605 	bp->numelems = num;
3606 
3607 
3608 	switch (ptype) {
3609 	case SEND_BUFFER:
3610 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3611 		bp->rsize = RPC_MSG_SZ;
3612 		break;
3613 	case RECV_BUFFER:
3614 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3615 		bp->rsize = RPC_BUF_SIZE;
3616 		break;
3617 	default:
3618 		goto fail;
3619 	}
3620 
3621 	/*
3622 	 * Register the pool.
3623 	 */
3624 	bp->bufsize = num * bp->rsize;
3625 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3626 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3627 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3628 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3629 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3630 	rw_enter(&hca->state_lock, RW_READER);
3631 
3632 	if (hca->state != HCA_INITED) {
3633 		rw_exit(&hca->state_lock);
3634 		goto fail;
3635 	}
3636 
3637 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3638 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3639 		mem_attr.mr_vaddr = (uintptr_t)buf;
3640 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3641 		mem_attr.mr_as = NULL;
3642 		ibt_status = ibt_register_mr(hca->hca_hdl,
3643 		    hca->pd_hdl, &mem_attr,
3644 		    &rbp->mr_hdl[i],
3645 		    &rbp->mr_desc[i]);
3646 		if (ibt_status != IBT_SUCCESS) {
3647 			for (j = 0; j < i; j++) {
3648 				(void) ibt_deregister_mr(hca->hca_hdl,
3649 				    rbp->mr_hdl[j]);
3650 			}
3651 			rw_exit(&hca->state_lock);
3652 			goto fail;
3653 		}
3654 	}
3655 	rw_exit(&hca->state_lock);
3656 	buf = (caddr_t)bp->buf;
3657 	for (i = 0; i < num; i++, buf += bp->rsize) {
3658 		bp->buflist[i] = (void *)buf;
3659 	}
3660 	bp->buffree = num - 1;	/* no. of free buffers */
3661 	rbp->bpool = bp;
3662 
3663 	return (rbp);
3664 fail:
3665 	if (bp) {
3666 		if (bp->buf)
3667 			kmem_free(bp->buf, bp->bufsize);
3668 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3669 	}
3670 	if (rbp) {
3671 		if (rbp->mr_hdl)
3672 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3673 		if (rbp->mr_desc)
3674 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3675 		kmem_free(rbp, sizeof (rib_bufpool_t));
3676 	}
3677 	return (NULL);
3678 }
3679 
3680 static void
3681 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3682 {
3683 	int i;
3684 	rib_bufpool_t *rbp = NULL;
3685 	bufpool_t *bp;
3686 
3687 	/*
3688 	 * Obtain pool address based on type of pool
3689 	 */
3690 	switch (ptype) {
3691 		case SEND_BUFFER:
3692 			rbp = hca->send_pool;
3693 			break;
3694 		case RECV_BUFFER:
3695 			rbp = hca->recv_pool;
3696 			break;
3697 		default:
3698 			return;
3699 	}
3700 	if (rbp == NULL)
3701 		return;
3702 
3703 	bp = rbp->bpool;
3704 
3705 	/*
3706 	 * Deregister the pool memory and free it.
3707 	 */
3708 	for (i = 0; i < bp->numelems; i++) {
3709 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3710 	}
3711 }
3712 
3713 static void
3714 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3715 {
3716 
3717 	rib_bufpool_t *rbp = NULL;
3718 	bufpool_t *bp;
3719 
3720 	/*
3721 	 * Obtain pool address based on type of pool
3722 	 */
3723 	switch (ptype) {
3724 		case SEND_BUFFER:
3725 			rbp = hca->send_pool;
3726 			break;
3727 		case RECV_BUFFER:
3728 			rbp = hca->recv_pool;
3729 			break;
3730 		default:
3731 			return;
3732 	}
3733 	if (rbp == NULL)
3734 		return;
3735 
3736 	bp = rbp->bpool;
3737 
3738 	/*
3739 	 * Free the pool memory.
3740 	 */
3741 	if (rbp->mr_hdl)
3742 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3743 
3744 	if (rbp->mr_desc)
3745 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3746 	if (bp->buf)
3747 		kmem_free(bp->buf, bp->bufsize);
3748 	mutex_destroy(&bp->buflock);
3749 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3750 	kmem_free(rbp, sizeof (rib_bufpool_t));
3751 }
3752 
3753 void
3754 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3755 {
3756 	/*
3757 	 * Deregister the pool memory and free it.
3758 	 */
3759 	rib_rbufpool_deregister(hca, ptype);
3760 	rib_rbufpool_free(hca, ptype);
3761 }
3762 
3763 /*
3764  * Fetch a buffer from the pool of type specified in rdbuf->type.
3765  */
3766 static rdma_stat
3767 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3768 {
3769 	rib_lrc_entry_t *rlep;
3770 
3771 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
3772 		rlep = rib_get_cache_buf(conn, rdbuf->len);
3773 		rdbuf->rb_private =  (caddr_t)rlep;
3774 		rdbuf->addr = rlep->lrc_buf;
3775 		rdbuf->handle = rlep->lrc_mhandle;
3776 		return (RDMA_SUCCESS);
3777 	}
3778 
3779 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
3780 	if (rdbuf->addr) {
3781 		switch (rdbuf->type) {
3782 		case SEND_BUFFER:
3783 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
3784 			break;
3785 		case RECV_BUFFER:
3786 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
3787 			break;
3788 		default:
3789 			rdbuf->len = 0;
3790 		}
3791 		return (RDMA_SUCCESS);
3792 	} else
3793 		return (RDMA_FAILED);
3794 }
3795 
3796 /*
3797  * Fetch a buffer of specified type.
3798  * Note that rdbuf->handle is mw's rkey.
3799  */
3800 static void *
3801 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3802 {
3803 	rib_qp_t	*qp = ctoqp(conn);
3804 	rib_hca_t	*hca = qp->hca;
3805 	rdma_btype	ptype = rdbuf->type;
3806 	void		*buf;
3807 	rib_bufpool_t	*rbp = NULL;
3808 	bufpool_t	*bp;
3809 	int		i;
3810 
3811 	/*
3812 	 * Obtain pool address based on type of pool
3813 	 */
3814 	switch (ptype) {
3815 	case SEND_BUFFER:
3816 		rbp = hca->send_pool;
3817 		break;
3818 	case RECV_BUFFER:
3819 		rbp = hca->recv_pool;
3820 		break;
3821 	default:
3822 		return (NULL);
3823 	}
3824 	if (rbp == NULL)
3825 		return (NULL);
3826 
3827 	bp = rbp->bpool;
3828 
3829 	mutex_enter(&bp->buflock);
3830 	if (bp->buffree < 0) {
3831 		mutex_exit(&bp->buflock);
3832 		return (NULL);
3833 	}
3834 
3835 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
3836 	buf = bp->buflist[bp->buffree];
3837 	rdbuf->addr = buf;
3838 	rdbuf->len = bp->rsize;
3839 	for (i = bp->numelems - 1; i >= 0; i--) {
3840 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
3841 			rdbuf->handle.mrc_rmr =
3842 			    (uint32_t)rbp->mr_desc[i].md_rkey;
3843 			rdbuf->handle.mrc_linfo =
3844 			    (uintptr_t)rbp->mr_hdl[i];
3845 			rdbuf->handle.mrc_lmr =
3846 			    (uint32_t)rbp->mr_desc[i].md_lkey;
3847 			bp->buffree--;
3848 
3849 			mutex_exit(&bp->buflock);
3850 
3851 			return (buf);
3852 		}
3853 	}
3854 
3855 	mutex_exit(&bp->buflock);
3856 
3857 	return (NULL);
3858 }
3859 
3860 static void
3861 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
3862 {
3863 
3864 	if (rdbuf->type == RDMA_LONG_BUFFER) {
3865 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
3866 		rdbuf->rb_private = NULL;
3867 		return;
3868 	}
3869 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
3870 }
3871 
3872 static void
3873 rib_rbuf_free(CONN *conn, int ptype, void *buf)
3874 {
3875 	rib_qp_t *qp = ctoqp(conn);
3876 	rib_hca_t *hca = qp->hca;
3877 	rib_bufpool_t *rbp = NULL;
3878 	bufpool_t *bp;
3879 
3880 	/*
3881 	 * Obtain pool address based on type of pool
3882 	 */
3883 	switch (ptype) {
3884 	case SEND_BUFFER:
3885 		rbp = hca->send_pool;
3886 		break;
3887 	case RECV_BUFFER:
3888 		rbp = hca->recv_pool;
3889 		break;
3890 	default:
3891 		return;
3892 	}
3893 	if (rbp == NULL)
3894 		return;
3895 
3896 	bp = rbp->bpool;
3897 
3898 	mutex_enter(&bp->buflock);
3899 	if (++bp->buffree >= bp->numelems) {
3900 		/*
3901 		 * Should never happen
3902 		 */
3903 		bp->buffree--;
3904 	} else {
3905 		bp->buflist[bp->buffree] = buf;
3906 	}
3907 	mutex_exit(&bp->buflock);
3908 }
3909 
3910 static rdma_stat
3911 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
3912 {
3913 	rw_enter(&connlist->conn_lock, RW_WRITER);
3914 	if (connlist->conn_hd) {
3915 		cn->c_next = connlist->conn_hd;
3916 		connlist->conn_hd->c_prev = cn;
3917 	}
3918 	connlist->conn_hd = cn;
3919 	rw_exit(&connlist->conn_lock);
3920 
3921 	return (RDMA_SUCCESS);
3922 }
3923 
3924 static rdma_stat
3925 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
3926 {
3927 	rw_enter(&connlist->conn_lock, RW_WRITER);
3928 	if (cn->c_prev) {
3929 		cn->c_prev->c_next = cn->c_next;
3930 	}
3931 	if (cn->c_next) {
3932 		cn->c_next->c_prev = cn->c_prev;
3933 	}
3934 	if (connlist->conn_hd == cn)
3935 		connlist->conn_hd = cn->c_next;
3936 	rw_exit(&connlist->conn_lock);
3937 
3938 	return (RDMA_SUCCESS);
3939 }
3940 
3941 /*
3942  * Connection management.
3943  * IBTF does not support recycling of channels. So connections are only
3944  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
3945  * C_DISCONN_PEND state. No C_IDLE state.
3946  * C_CONN_PEND state: Connection establishment in progress to the server.
3947  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
3948  * It has an RC channel associated with it. ibt_post_send/recv are allowed
3949  * only in this state.
3950  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
3951  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
3952  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
3953  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
3954  * c_ref drops to 0 (this indicates that RPC has no more references to this
3955  * connection), the connection should be destroyed. A connection transitions
3956  * into this state when it is being destroyed.
3957  */
3958 /* ARGSUSED */
3959 static rdma_stat
3960 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
3961 {
3962 	CONN *cn;
3963 	int status = RDMA_SUCCESS;
3964 	rib_hca_t *hca = rib_stat->hca;
3965 	rib_qp_t *qp;
3966 	clock_t cv_stat, timout;
3967 	rpcib_ping_t rpt;
3968 
3969 	if (hca == NULL)
3970 		return (RDMA_FAILED);
3971 
3972 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3973 	if (hca->state == HCA_DETACHED) {
3974 		rw_exit(&rib_stat->hca->state_lock);
3975 		return (RDMA_FAILED);
3976 	}
3977 	rw_exit(&rib_stat->hca->state_lock);
3978 
3979 again:
3980 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
3981 	cn = hca->cl_conn_list.conn_hd;
3982 	while (cn != NULL) {
3983 		/*
3984 		 * First, clear up any connection in the ERROR state
3985 		 */
3986 		mutex_enter(&cn->c_lock);
3987 		if (cn->c_state == C_ERROR_CONN) {
3988 			if (cn->c_ref == 0) {
3989 				/*
3990 				 * Remove connection from list and destroy it.
3991 				 */
3992 				cn->c_state = C_DISCONN_PEND;
3993 				mutex_exit(&cn->c_lock);
3994 				rw_exit(&hca->cl_conn_list.conn_lock);
3995 				rib_conn_close((void *)cn);
3996 				goto again;
3997 			}
3998 			mutex_exit(&cn->c_lock);
3999 			cn = cn->c_next;
4000 			continue;
4001 		}
4002 		if (cn->c_state == C_DISCONN_PEND) {
4003 			mutex_exit(&cn->c_lock);
4004 			cn = cn->c_next;
4005 			continue;
4006 		}
4007 		if ((cn->c_raddr.len == svcaddr->len) &&
4008 		    bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
4009 			/*
4010 			 * Our connection. Give up conn list lock
4011 			 * as we are done traversing the list.
4012 			 */
4013 			rw_exit(&hca->cl_conn_list.conn_lock);
4014 			if (cn->c_state == C_CONNECTED) {
4015 				cn->c_ref++;	/* sharing a conn */
4016 				mutex_exit(&cn->c_lock);
4017 				*conn = cn;
4018 				return (status);
4019 			}
4020 			if (cn->c_state == C_CONN_PEND) {
4021 				/*
4022 				 * Hold a reference to this conn before
4023 				 * we give up the lock.
4024 				 */
4025 				cn->c_ref++;
4026 				timout =  ddi_get_lbolt() +
4027 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
4028 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4029 				    &cn->c_lock, timout)) > 0 &&
4030 				    cn->c_state == C_CONN_PEND)
4031 					;
4032 				if (cv_stat == 0) {
4033 					cn->c_ref--;
4034 					mutex_exit(&cn->c_lock);
4035 					return (RDMA_INTR);
4036 				}
4037 				if (cv_stat < 0) {
4038 					cn->c_ref--;
4039 					mutex_exit(&cn->c_lock);
4040 					return (RDMA_TIMEDOUT);
4041 				}
4042 				if (cn->c_state == C_CONNECTED) {
4043 					*conn = cn;
4044 					mutex_exit(&cn->c_lock);
4045 					return (status);
4046 				} else {
4047 					cn->c_ref--;
4048 					mutex_exit(&cn->c_lock);
4049 					return (RDMA_TIMEDOUT);
4050 				}
4051 			}
4052 		}
4053 		mutex_exit(&cn->c_lock);
4054 		cn = cn->c_next;
4055 	}
4056 	rw_exit(&hca->cl_conn_list.conn_lock);
4057 
4058 	bzero(&rpt, sizeof (rpcib_ping_t));
4059 
4060 	status = rib_ping_srv(addr_type, svcaddr, &rpt);
4061 	if (status != RDMA_SUCCESS) {
4062 		return (RDMA_FAILED);
4063 	}
4064 
4065 	/*
4066 	 * Channel to server doesn't exist yet, create one.
4067 	 */
4068 	if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
4069 		return (RDMA_FAILED);
4070 	}
4071 	cn = qptoc(qp);
4072 	cn->c_state = C_CONN_PEND;
4073 	cn->c_ref = 1;
4074 
4075 	/*
4076 	 * Add to conn list.
4077 	 * We had given up the READER lock. In the time since then,
4078 	 * another thread might have created the connection we are
4079 	 * trying here. But for now, that is quiet alright - there
4080 	 * might be two connections between a pair of hosts instead
4081 	 * of one. If we really want to close that window,
4082 	 * then need to check the list after acquiring the
4083 	 * WRITER lock.
4084 	 */
4085 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4086 	status = rib_conn_to_srv(hca, qp, &rpt);
4087 	mutex_enter(&cn->c_lock);
4088 	if (status == RDMA_SUCCESS) {
4089 		cn->c_state = C_CONNECTED;
4090 		*conn = cn;
4091 	} else {
4092 		cn->c_state = C_ERROR_CONN;
4093 		cn->c_ref--;
4094 	}
4095 	cv_broadcast(&cn->c_cv);
4096 	mutex_exit(&cn->c_lock);
4097 	return (status);
4098 }
4099 
4100 static void
4101 rib_conn_close(void *rarg)
4102 {
4103 	CONN *conn = (CONN *)rarg;
4104 	rib_qp_t *qp = ctoqp(conn);
4105 
4106 	mutex_enter(&conn->c_lock);
4107 	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4108 
4109 		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4110 		/*
4111 		 * Live connection in CONNECTED state.
4112 		 */
4113 		if (conn->c_state == C_CONNECTED) {
4114 			conn->c_state = C_ERROR_CONN;
4115 		}
4116 		mutex_exit(&conn->c_lock);
4117 
4118 		rib_close_a_channel(conn);
4119 
4120 		mutex_enter(&conn->c_lock);
4121 		conn->c_flags &= ~C_CLOSE_PENDING;
4122 		cv_signal(&conn->c_cv);
4123 	}
4124 
4125 	mutex_exit(&conn->c_lock);
4126 
4127 	if (qp->mode == RIB_SERVER)
4128 		(void) rib_disconnect_channel(conn,
4129 		    &qp->hca->srv_conn_list);
4130 	else
4131 		(void) rib_disconnect_channel(conn,
4132 		    &qp->hca->cl_conn_list);
4133 }
4134 
4135 static void
4136 rib_conn_timeout_call(void *carg)
4137 {
4138 	time_t idle_time;
4139 	CONN *conn = (CONN *)carg;
4140 	rib_hca_t *hca = ctoqp(conn)->hca;
4141 	int error;
4142 
4143 	mutex_enter(&conn->c_lock);
4144 	if ((conn->c_ref > 0) ||
4145 	    (conn->c_state == C_DISCONN_PEND)) {
4146 		conn->c_timeout = NULL;
4147 		mutex_exit(&conn->c_lock);
4148 		return;
4149 	}
4150 
4151 	idle_time = (gethrestime_sec() - conn->c_last_used);
4152 
4153 	if ((idle_time <= rib_conn_timeout) &&
4154 	    (conn->c_state != C_ERROR_CONN)) {
4155 		/*
4156 		 * There was activity after the last timeout.
4157 		 * Extend the conn life. Unless the conn is
4158 		 * already in error state.
4159 		 */
4160 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4161 		    SEC_TO_TICK(rib_conn_timeout - idle_time));
4162 		mutex_exit(&conn->c_lock);
4163 		return;
4164 	}
4165 
4166 	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4167 	    (void *)conn, DDI_NOSLEEP);
4168 
4169 	/*
4170 	 * If taskq dispatch fails above, then reset the timeout
4171 	 * to try again after 10 secs.
4172 	 */
4173 
4174 	if (error != DDI_SUCCESS) {
4175 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4176 		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4177 		mutex_exit(&conn->c_lock);
4178 		return;
4179 	}
4180 
4181 	conn->c_state = C_DISCONN_PEND;
4182 	mutex_exit(&conn->c_lock);
4183 }
4184 
4185 static rdma_stat
4186 rib_conn_release(CONN *conn)
4187 {
4188 
4189 	mutex_enter(&conn->c_lock);
4190 	conn->c_ref--;
4191 
4192 	conn->c_last_used = gethrestime_sec();
4193 	if (conn->c_ref > 0) {
4194 		mutex_exit(&conn->c_lock);
4195 		return (RDMA_SUCCESS);
4196 	}
4197 
4198 	/*
4199 	 * If a conn is C_ERROR_CONN, close the channel.
4200 	 */
4201 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4202 		conn->c_state = C_DISCONN_PEND;
4203 		mutex_exit(&conn->c_lock);
4204 		rib_conn_close((void *)conn);
4205 		return (RDMA_SUCCESS);
4206 	}
4207 
4208 	/*
4209 	 * c_ref == 0, set a timeout for conn release
4210 	 */
4211 
4212 	if (conn->c_timeout == NULL) {
4213 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4214 		    SEC_TO_TICK(rib_conn_timeout));
4215 	}
4216 
4217 	mutex_exit(&conn->c_lock);
4218 	return (RDMA_SUCCESS);
4219 }
4220 
4221 /*
4222  * Add at front of list
4223  */
4224 static struct rdma_done_list *
4225 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4226 {
4227 	struct rdma_done_list *rd;
4228 
4229 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4230 
4231 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4232 	rd->xid = xid;
4233 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4234 
4235 	rd->prev = NULL;
4236 	rd->next = qp->rdlist;
4237 	if (qp->rdlist != NULL)
4238 		qp->rdlist->prev = rd;
4239 	qp->rdlist = rd;
4240 
4241 	return (rd);
4242 }
4243 
4244 static void
4245 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4246 {
4247 	struct rdma_done_list *r;
4248 
4249 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4250 
4251 	r = rd->next;
4252 	if (r != NULL) {
4253 		r->prev = rd->prev;
4254 	}
4255 
4256 	r = rd->prev;
4257 	if (r != NULL) {
4258 		r->next = rd->next;
4259 	} else {
4260 		qp->rdlist = rd->next;
4261 	}
4262 
4263 	cv_destroy(&rd->rdma_done_cv);
4264 	kmem_free(rd, sizeof (*rd));
4265 }
4266 
4267 static void
4268 rdma_done_rem_list(rib_qp_t *qp)
4269 {
4270 	struct rdma_done_list	*r, *n;
4271 
4272 	mutex_enter(&qp->rdlist_lock);
4273 	for (r = qp->rdlist; r != NULL; r = n) {
4274 		n = r->next;
4275 		rdma_done_rm(qp, r);
4276 	}
4277 	mutex_exit(&qp->rdlist_lock);
4278 }
4279 
4280 static void
4281 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4282 {
4283 	struct rdma_done_list *r = qp->rdlist;
4284 
4285 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4286 
4287 	while (r) {
4288 		if (r->xid == xid) {
4289 			cv_signal(&r->rdma_done_cv);
4290 			return;
4291 		} else {
4292 			r = r->next;
4293 		}
4294 	}
4295 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4296 	    int, xid);
4297 }
4298 
4299 /*
4300  * Expects conn->c_lock to be held by the caller.
4301  */
4302 
4303 static void
4304 rib_close_a_channel(CONN *conn)
4305 {
4306 	rib_qp_t	*qp;
4307 	qp = ctoqp(conn);
4308 
4309 	if (qp->qp_hdl == NULL) {
4310 		/* channel already freed */
4311 		return;
4312 	}
4313 
4314 	/*
4315 	 * Call ibt_close_rc_channel in blocking mode
4316 	 * with no callbacks.
4317 	 */
4318 	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4319 	    NULL, 0, NULL, NULL, 0);
4320 }
4321 
4322 /*
4323  * Goes through all connections and closes the channel
4324  * This will cause all the WRs on those channels to be
4325  * flushed.
4326  */
4327 static void
4328 rib_close_channels(rib_conn_list_t *connlist)
4329 {
4330 	CONN 		*conn, *tmp;
4331 
4332 	rw_enter(&connlist->conn_lock, RW_READER);
4333 	conn = connlist->conn_hd;
4334 	while (conn != NULL) {
4335 		mutex_enter(&conn->c_lock);
4336 		tmp = conn->c_next;
4337 		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4338 
4339 			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4340 
4341 			/*
4342 			 * Live connection in CONNECTED state.
4343 			 */
4344 			if (conn->c_state == C_CONNECTED)
4345 				conn->c_state = C_ERROR_CONN;
4346 			mutex_exit(&conn->c_lock);
4347 
4348 			rib_close_a_channel(conn);
4349 
4350 			mutex_enter(&conn->c_lock);
4351 			conn->c_flags &= ~C_CLOSE_PENDING;
4352 			/* Signal a pending rib_disconnect_channel() */
4353 			cv_signal(&conn->c_cv);
4354 		}
4355 		mutex_exit(&conn->c_lock);
4356 		conn = tmp;
4357 	}
4358 	rw_exit(&connlist->conn_lock);
4359 }
4360 
4361 /*
4362  * Frees up all connections that are no longer being referenced
4363  */
4364 static void
4365 rib_purge_connlist(rib_conn_list_t *connlist)
4366 {
4367 	CONN 		*conn;
4368 
4369 top:
4370 	rw_enter(&connlist->conn_lock, RW_READER);
4371 	conn = connlist->conn_hd;
4372 	while (conn != NULL) {
4373 		mutex_enter(&conn->c_lock);
4374 
4375 		/*
4376 		 * At this point connection is either in ERROR
4377 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4378 		 * then some other thread is culling that connection.
4379 		 * If not and if c_ref is 0, then destroy the connection.
4380 		 */
4381 		if (conn->c_ref == 0 &&
4382 		    conn->c_state != C_DISCONN_PEND) {
4383 			/*
4384 			 * Cull the connection
4385 			 */
4386 			conn->c_state = C_DISCONN_PEND;
4387 			mutex_exit(&conn->c_lock);
4388 			rw_exit(&connlist->conn_lock);
4389 			(void) rib_disconnect_channel(conn, connlist);
4390 			goto top;
4391 		} else {
4392 			/*
4393 			 * conn disconnect already scheduled or will
4394 			 * happen from conn_release when c_ref drops to 0.
4395 			 */
4396 			mutex_exit(&conn->c_lock);
4397 		}
4398 		conn = conn->c_next;
4399 	}
4400 	rw_exit(&connlist->conn_lock);
4401 
4402 	/*
4403 	 * At this point, only connections with c_ref != 0 are on the list
4404 	 */
4405 }
4406 
4407 /*
4408  * Free all the HCA resources and close
4409  * the hca.
4410  */
4411 
4412 static void
4413 rib_free_hca(rib_hca_t *hca)
4414 {
4415 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4416 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4417 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4418 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4419 
4420 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4421 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4422 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4423 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4424 
4425 	rib_rbufpool_destroy(hca, RECV_BUFFER);
4426 	rib_rbufpool_destroy(hca, SEND_BUFFER);
4427 	rib_destroy_cache(hca);
4428 	if (rib_mod.rdma_count == 0)
4429 		rdma_unregister_mod(&rib_mod);
4430 	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4431 	(void) ibt_close_hca(hca->hca_hdl);
4432 	hca->hca_hdl = NULL;
4433 }
4434 
4435 /*
4436  * Cleans and closes up all uses of the HCA
4437  */
4438 static void
4439 rib_detach_hca(rib_hca_t *hca)
4440 {
4441 
4442 	/*
4443 	 * Stop all services on the HCA
4444 	 * Go through cl_conn_list and close all rc_channels
4445 	 * Go through svr_conn_list and close all rc_channels
4446 	 * Free connections whose c_ref has dropped to 0
4447 	 * Destroy all CQs
4448 	 * Deregister and released all buffer pool memory after all
4449 	 * connections are destroyed
4450 	 * Free the protection domain
4451 	 * ibt_close_hca()
4452 	 */
4453 	rw_enter(&hca->state_lock, RW_WRITER);
4454 	if (hca->state == HCA_DETACHED) {
4455 		rw_exit(&hca->state_lock);
4456 		return;
4457 	}
4458 
4459 	hca->state = HCA_DETACHED;
4460 	rib_stat->nhca_inited--;
4461 
4462 	rib_stop_services(hca);
4463 	rib_close_channels(&hca->cl_conn_list);
4464 	rib_close_channels(&hca->srv_conn_list);
4465 
4466 	rib_mod.rdma_count--;
4467 
4468 	rw_exit(&hca->state_lock);
4469 
4470 	rib_purge_connlist(&hca->cl_conn_list);
4471 	rib_purge_connlist(&hca->srv_conn_list);
4472 
4473 	if (stats_enabled) {
4474 		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4475 		    GLOBAL_ZONEID);
4476 	}
4477 
4478 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4479 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4480 	if (hca->srv_conn_list.conn_hd == NULL &&
4481 	    hca->cl_conn_list.conn_hd == NULL) {
4482 		/*
4483 		 * conn_lists are NULL, so destroy
4484 		 * buffers, close hca and be done.
4485 		 */
4486 		rib_free_hca(hca);
4487 	}
4488 	rw_exit(&hca->cl_conn_list.conn_lock);
4489 	rw_exit(&hca->srv_conn_list.conn_lock);
4490 
4491 	if (hca->hca_hdl != NULL) {
4492 		mutex_enter(&hca->inuse_lock);
4493 		while (hca->inuse)
4494 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4495 		mutex_exit(&hca->inuse_lock);
4496 
4497 		rib_free_hca(hca);
4498 	}
4499 
4500 	if (hca->cleanup_helper != NULL) {
4501 		ddi_taskq_destroy(hca->cleanup_helper);
4502 		hca->cleanup_helper = NULL;
4503 	}
4504 }
4505 
4506 static void
4507 rib_server_side_cache_reclaim(void *argp)
4508 {
4509 	cache_avl_struct_t    *rcas;
4510 	rib_lrc_entry_t		*rb;
4511 	rib_hca_t *hca = (rib_hca_t *)argp;
4512 
4513 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4514 	rcas = avl_first(&hca->avl_tree);
4515 	if (rcas != NULL)
4516 		avl_remove(&hca->avl_tree, rcas);
4517 
4518 	while (rcas != NULL) {
4519 		while (rcas->r.forw != &rcas->r) {
4520 			rcas->elements--;
4521 			rib_total_buffers --;
4522 			rb = rcas->r.forw;
4523 			remque(rb);
4524 			if (rb->registered)
4525 				(void) rib_deregistermem_via_hca(hca,
4526 				    rb->lrc_buf, rb->lrc_mhandle);
4527 			cache_allocation -= rb->lrc_len;
4528 			kmem_free(rb->lrc_buf, rb->lrc_len);
4529 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4530 		}
4531 		mutex_destroy(&rcas->node_lock);
4532 		kmem_cache_free(hca->server_side_cache, rcas);
4533 		rcas = avl_first(&hca->avl_tree);
4534 		if (rcas != NULL)
4535 			avl_remove(&hca->avl_tree, rcas);
4536 	}
4537 	rw_exit(&hca->avl_rw_lock);
4538 }
4539 
4540 static void
4541 rib_server_side_cache_cleanup(void *argp)
4542 {
4543 	cache_avl_struct_t    *rcas;
4544 	rib_lrc_entry_t		*rb;
4545 	rib_hca_t *hca = (rib_hca_t *)argp;
4546 
4547 	rw_enter(&hca->avl_rw_lock, RW_READER);
4548 	if (cache_allocation < cache_limit) {
4549 		rw_exit(&hca->avl_rw_lock);
4550 		return;
4551 	}
4552 	rw_exit(&hca->avl_rw_lock);
4553 
4554 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4555 	rcas = avl_last(&hca->avl_tree);
4556 	if (rcas != NULL)
4557 		avl_remove(&hca->avl_tree, rcas);
4558 
4559 	while (rcas != NULL) {
4560 		while (rcas->r.forw != &rcas->r) {
4561 			rcas->elements--;
4562 			rib_total_buffers --;
4563 			rb = rcas->r.forw;
4564 			remque(rb);
4565 			if (rb->registered)
4566 				(void) rib_deregistermem_via_hca(hca,
4567 				    rb->lrc_buf, rb->lrc_mhandle);
4568 			cache_allocation -= rb->lrc_len;
4569 			kmem_free(rb->lrc_buf, rb->lrc_len);
4570 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4571 		}
4572 		mutex_destroy(&rcas->node_lock);
4573 		if (hca->server_side_cache) {
4574 			kmem_cache_free(hca->server_side_cache, rcas);
4575 		}
4576 		if ((cache_allocation) < cache_limit) {
4577 			rw_exit(&hca->avl_rw_lock);
4578 			return;
4579 		}
4580 
4581 		rcas = avl_last(&hca->avl_tree);
4582 		if (rcas != NULL)
4583 			avl_remove(&hca->avl_tree, rcas);
4584 	}
4585 	rw_exit(&hca->avl_rw_lock);
4586 }
4587 
4588 static int
4589 avl_compare(const void *t1, const void *t2)
4590 {
4591 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4592 		return (0);
4593 
4594 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4595 		return (-1);
4596 
4597 	return (1);
4598 }
4599 
4600 static void
4601 rib_destroy_cache(rib_hca_t *hca)
4602 {
4603 	if (hca->avl_init) {
4604 		rib_server_side_cache_reclaim((void *)hca);
4605 		if (hca->server_side_cache) {
4606 			kmem_cache_destroy(hca->server_side_cache);
4607 			hca->server_side_cache = NULL;
4608 		}
4609 		avl_destroy(&hca->avl_tree);
4610 		mutex_destroy(&hca->cache_allocation);
4611 		rw_destroy(&hca->avl_rw_lock);
4612 	}
4613 	hca->avl_init = FALSE;
4614 }
4615 
4616 static void
4617 rib_force_cleanup(void *hca)
4618 {
4619 	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
4620 		(void) ddi_taskq_dispatch(
4621 		    ((rib_hca_t *)hca)->cleanup_helper,
4622 		    rib_server_side_cache_cleanup,
4623 		    (void *)hca, DDI_NOSLEEP);
4624 }
4625 
4626 static rib_lrc_entry_t *
4627 rib_get_cache_buf(CONN *conn, uint32_t len)
4628 {
4629 	cache_avl_struct_t	cas, *rcas;
4630 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4631 	rib_lrc_entry_t *reply_buf;
4632 	avl_index_t where = NULL;
4633 	uint64_t c_alloc = 0;
4634 
4635 	if (!hca->avl_init)
4636 		goto  error_alloc;
4637 
4638 	cas.len = len;
4639 
4640 	rw_enter(&hca->avl_rw_lock, RW_READER);
4641 
4642 	mutex_enter(&hca->cache_allocation);
4643 	c_alloc = cache_allocation;
4644 	mutex_exit(&hca->cache_allocation);
4645 
4646 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
4647 	    &where)) == NULL) {
4648 		/* Am I above the cache limit */
4649 		if ((c_alloc + len) >= cache_limit) {
4650 			rib_force_cleanup((void *)hca);
4651 			rw_exit(&hca->avl_rw_lock);
4652 			cache_misses_above_the_limit ++;
4653 
4654 			/* Allocate and register the buffer directly */
4655 			goto error_alloc;
4656 		}
4657 
4658 		rw_exit(&hca->avl_rw_lock);
4659 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
4660 
4661 		/* Recheck to make sure no other thread added the entry in */
4662 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
4663 		    &cas, &where)) == NULL) {
4664 			/* Allocate an avl tree entry */
4665 			rcas = (cache_avl_struct_t *)
4666 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
4667 
4668 			bzero(rcas, sizeof (cache_avl_struct_t));
4669 			rcas->elements = 0;
4670 			rcas->r.forw = &rcas->r;
4671 			rcas->r.back = &rcas->r;
4672 			rcas->len = len;
4673 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
4674 			avl_insert(&hca->avl_tree, rcas, where);
4675 		}
4676 	}
4677 
4678 	mutex_enter(&rcas->node_lock);
4679 
4680 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
4681 		rib_total_buffers--;
4682 		cache_hits++;
4683 		reply_buf = rcas->r.forw;
4684 		remque(reply_buf);
4685 		rcas->elements--;
4686 		mutex_exit(&rcas->node_lock);
4687 		rw_exit(&hca->avl_rw_lock);
4688 		mutex_enter(&hca->cache_allocation);
4689 		cache_allocation -= len;
4690 		mutex_exit(&hca->cache_allocation);
4691 	} else {
4692 		/* Am I above the cache limit */
4693 		mutex_exit(&rcas->node_lock);
4694 		if ((c_alloc + len) >= cache_limit) {
4695 			rib_force_cleanup((void *)hca);
4696 			rw_exit(&hca->avl_rw_lock);
4697 			cache_misses_above_the_limit ++;
4698 			/* Allocate and register the buffer directly */
4699 			goto error_alloc;
4700 		}
4701 		rw_exit(&hca->avl_rw_lock);
4702 		cache_misses ++;
4703 		/* Allocate a reply_buf entry */
4704 		reply_buf = (rib_lrc_entry_t *)
4705 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4706 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
4707 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
4708 		reply_buf->lrc_len  = len;
4709 		reply_buf->registered = FALSE;
4710 		reply_buf->avl_node = (void *)rcas;
4711 	}
4712 
4713 	return (reply_buf);
4714 
4715 error_alloc:
4716 	reply_buf = (rib_lrc_entry_t *)
4717 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4718 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
4719 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
4720 	reply_buf->lrc_len = len;
4721 	reply_buf->registered = FALSE;
4722 	reply_buf->avl_node = NULL;
4723 
4724 	return (reply_buf);
4725 }
4726 
4727 /*
4728  * Return a pre-registered back to the cache (without
4729  * unregistering the buffer)..
4730  */
4731 
4732 static void
4733 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
4734 {
4735 	cache_avl_struct_t    cas, *rcas;
4736 	avl_index_t where = NULL;
4737 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4738 
4739 	if (!hca->avl_init)
4740 		goto  error_free;
4741 
4742 	cas.len = reg_buf->lrc_len;
4743 	rw_enter(&hca->avl_rw_lock, RW_READER);
4744 	if ((rcas = (cache_avl_struct_t *)
4745 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
4746 		rw_exit(&hca->avl_rw_lock);
4747 		goto error_free;
4748 	} else {
4749 		rib_total_buffers ++;
4750 		cas.len = reg_buf->lrc_len;
4751 		mutex_enter(&rcas->node_lock);
4752 		insque(reg_buf, &rcas->r);
4753 		rcas->elements ++;
4754 		mutex_exit(&rcas->node_lock);
4755 		rw_exit(&hca->avl_rw_lock);
4756 		mutex_enter(&hca->cache_allocation);
4757 		cache_allocation += cas.len;
4758 		mutex_exit(&hca->cache_allocation);
4759 	}
4760 
4761 	return;
4762 
4763 error_free:
4764 
4765 	if (reg_buf->registered)
4766 		(void) rib_deregistermem_via_hca(hca,
4767 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
4768 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
4769 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
4770 }
4771 
4772 static rdma_stat
4773 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
4774 	uint_t buflen, struct mrc *buf_handle)
4775 {
4776 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
4777 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
4778 	rdma_stat	status;
4779 
4780 
4781 	/*
4782 	 * Note: ALL buffer pools use the same memory type RDMARW.
4783 	 */
4784 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
4785 	if (status == RDMA_SUCCESS) {
4786 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
4787 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4788 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4789 	} else {
4790 		buf_handle->mrc_linfo = NULL;
4791 		buf_handle->mrc_lmr = 0;
4792 		buf_handle->mrc_rmr = 0;
4793 	}
4794 	return (status);
4795 }
4796 
4797 /* ARGSUSED */
4798 static rdma_stat
4799 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
4800     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
4801 {
4802 
4803 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
4804 	return (RDMA_SUCCESS);
4805 }
4806 
4807 /* ARGSUSED */
4808 static rdma_stat
4809 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
4810 {
4811 
4812 	(void) ibt_deregister_mr(hca->hca_hdl,
4813 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4814 	return (RDMA_SUCCESS);
4815 }
4816 
4817 /*
4818  * Check if the IP interface named by `lifrp' is RDMA-capable.
4819  */
4820 static boolean_t
4821 rpcib_rdma_capable_interface(struct lifreq *lifrp)
4822 {
4823 	char ifname[LIFNAMSIZ];
4824 	char *cp;
4825 
4826 	if (lifrp->lifr_type == IFT_IB)
4827 		return (B_TRUE);
4828 
4829 	/*
4830 	 * Strip off the logical interface portion before getting
4831 	 * intimate with the name.
4832 	 */
4833 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
4834 	if ((cp = strchr(ifname, ':')) != NULL)
4835 		*cp = '\0';
4836 
4837 	return (strcmp("lo0", ifname) == 0);
4838 }
4839 
4840 static int
4841 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
4842 {
4843 	vnode_t *kvp, *vp;
4844 	TIUSER  *tiptr;
4845 	struct  strioctl iocb;
4846 	k_sigset_t smask;
4847 	int	err = 0;
4848 
4849 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
4850 		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
4851 		    &tiptr, CRED()) == 0) {
4852 			vp = tiptr->fp->f_vnode;
4853 		} else {
4854 			VN_RELE(kvp);
4855 			return (EPROTO);
4856 		}
4857 	} else {
4858 		return (EPROTO);
4859 	}
4860 
4861 	iocb.ic_cmd = cmd;
4862 	iocb.ic_timout = 0;
4863 	iocb.ic_len = len;
4864 	iocb.ic_dp = (caddr_t)arg;
4865 	sigintr(&smask, 0);
4866 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4867 	sigunintr(&smask);
4868 	(void) t_kclose(tiptr, 0);
4869 	VN_RELE(kvp);
4870 	return (err);
4871 }
4872 
4873 /*
4874  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
4875  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
4876  */
4877 static int
4878 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
4879 {
4880 	int err;
4881 	struct lifnum lifn;
4882 
4883 	bzero(&lifn, sizeof (struct lifnum));
4884 	lifn.lifn_family = AF_UNSPEC;
4885 
4886 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
4887 	if (err != 0)
4888 		return (err);
4889 
4890 	/*
4891 	 * Pad the interface count to account for additional interfaces that
4892 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
4893 	 */
4894 	lifn.lifn_count += 4;
4895 
4896 	bzero(lifcp, sizeof (struct lifconf));
4897 	lifcp->lifc_family = AF_UNSPEC;
4898 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
4899 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
4900 
4901 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
4902 	if (err != 0) {
4903 		kmem_free(lifcp->lifc_buf, *bufsizep);
4904 		return (err);
4905 	}
4906 	return (0);
4907 }
4908 
4909 static boolean_t
4910 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
4911 {
4912 	uint_t i, nifs;
4913 	uint_t bufsize;
4914 	struct lifconf lifc;
4915 	struct lifreq *lifrp;
4916 	struct sockaddr_in *sinp;
4917 	struct sockaddr_in6 *sin6p;
4918 
4919 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
4920 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
4921 
4922 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
4923 		return (B_FALSE);
4924 
4925 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
4926 		kmem_free(lifc.lifc_buf, bufsize);
4927 		return (B_FALSE);
4928 	}
4929 
4930 	/*
4931 	 * Worst case is that all of the addresses are IB-capable and have
4932 	 * the same address family, so size our buffers accordingly.
4933 	 */
4934 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
4935 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
4936 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
4937 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
4938 
4939 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
4940 		if (!rpcib_rdma_capable_interface(lifrp))
4941 			continue;
4942 
4943 		if (lifrp->lifr_addr.ss_family == AF_INET) {
4944 			sinp = addrs4->ri_list;
4945 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
4946 			    sizeof (struct sockaddr_in));
4947 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
4948 			sin6p = addrs6->ri_list;
4949 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
4950 			    sizeof (struct sockaddr_in6));
4951 		}
4952 	}
4953 
4954 	kmem_free(lifc.lifc_buf, bufsize);
4955 	return (B_TRUE);
4956 }
4957 
4958 /* ARGSUSED */
4959 static int rpcib_cache_kstat_update(kstat_t *ksp, int rw) {
4960 
4961 	if (KSTAT_WRITE == rw) {
4962 		return (EACCES);
4963 	}
4964 	rpcib_kstat.cache_limit.value.ui64 =
4965 	    (uint64_t)cache_limit;
4966 	rpcib_kstat.cache_allocation.value.ui64 =
4967 	    (uint64_t)cache_allocation;
4968 	rpcib_kstat.cache_hits.value.ui64 =
4969 	    (uint64_t)cache_hits;
4970 	rpcib_kstat.cache_misses.value.ui64 =
4971 	    (uint64_t)cache_misses;
4972 	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
4973 	    (uint64_t)cache_misses_above_the_limit;
4974 	return (0);
4975 }
4976