xref: /illumos-gate/usr/src/uts/common/rpc/rpcib.c (revision ef2504f26d1ea5859db9838255bb63f488f1b050)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developors:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * The rpcib plugin. Implements the interface for RDMATF's
42  * interaction with IBTF.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84 
85 #define	NFS_RDMA_PORT	20049
86 
87 
88 /*
89  * Convenience structures for connection management
90  */
91 typedef struct rpcib_ipaddrs {
92 	void	*ri_list;	/* pointer to list of addresses */
93 	uint_t	ri_count;	/* number of addresses in list */
94 	uint_t	ri_size;	/* size of ri_list in bytes */
95 } rpcib_ipaddrs_t;
96 
97 
98 typedef struct rpcib_ping {
99 	rib_hca_t  *hca;
100 	ibt_path_info_t path;
101 	ibt_ip_addr_t srcip;
102 	ibt_ip_addr_t dstip;
103 } rpcib_ping_t;
104 
105 /*
106  * Prototype declarations for driver ops
107  */
108 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 				void *, void **);
111 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 static int	rpcib_do_ip_ioctl(int, int, void *);
114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 static int rpcib_cache_kstat_update(kstat_t *, int);
116 static void rib_force_cleanup(void *);
117 
118 struct {
119 	kstat_named_t cache_limit;
120 	kstat_named_t cache_allocation;
121 	kstat_named_t cache_hits;
122 	kstat_named_t cache_misses;
123 	kstat_named_t cache_misses_above_the_limit;
124 } rpcib_kstat = {
125 	{"cache_limit",			KSTAT_DATA_UINT64 },
126 	{"cache_allocation",		KSTAT_DATA_UINT64 },
127 	{"cache_hits",			KSTAT_DATA_UINT64 },
128 	{"cache_misses",		KSTAT_DATA_UINT64 },
129 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
130 };
131 
132 /* rpcib cb_ops */
133 static struct cb_ops rpcib_cbops = {
134 	nulldev,		/* open */
135 	nulldev,		/* close */
136 	nodev,			/* strategy */
137 	nodev,			/* print */
138 	nodev,			/* dump */
139 	nodev,			/* read */
140 	nodev,			/* write */
141 	nodev,			/* ioctl */
142 	nodev,			/* devmap */
143 	nodev,			/* mmap */
144 	nodev,			/* segmap */
145 	nochpoll,		/* poll */
146 	ddi_prop_op,		/* prop_op */
147 	NULL,			/* stream */
148 	D_MP,			/* cb_flag */
149 	CB_REV,			/* rev */
150 	nodev,			/* int (*cb_aread)() */
151 	nodev			/* int (*cb_awrite)() */
152 };
153 
154 /*
155  * Device options
156  */
157 static struct dev_ops rpcib_ops = {
158 	DEVO_REV,		/* devo_rev, */
159 	0,			/* refcnt  */
160 	rpcib_getinfo,		/* info */
161 	nulldev,		/* identify */
162 	nulldev,		/* probe */
163 	rpcib_attach,		/* attach */
164 	rpcib_detach,		/* detach */
165 	nodev,			/* reset */
166 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
167 	NULL,			/* bus operations */
168 	NULL,			/* power */
169 	ddi_quiesce_not_needed,		/* quiesce */
170 };
171 
172 /*
173  * Module linkage information.
174  */
175 
176 static struct modldrv rib_modldrv = {
177 	&mod_driverops,		/* Driver module */
178 	"RPCIB plugin driver",	/* Driver name and version */
179 	&rpcib_ops,		/* Driver ops */
180 };
181 
182 static struct modlinkage rib_modlinkage = {
183 	MODREV_1,
184 	(void *)&rib_modldrv,
185 	NULL
186 };
187 
188 typedef struct rib_lrc_entry {
189 	struct rib_lrc_entry *forw;
190 	struct rib_lrc_entry *back;
191 	char *lrc_buf;
192 
193 	uint32_t lrc_len;
194 	void  *avl_node;
195 	bool_t registered;
196 
197 	struct mrc lrc_mhandle;
198 	bool_t lrc_on_freed_list;
199 } rib_lrc_entry_t;
200 
201 typedef	struct cache_struct	{
202 	rib_lrc_entry_t		r;
203 	uint32_t		len;
204 	uint32_t		elements;
205 	kmutex_t		node_lock;
206 	avl_node_t		avl_link;
207 } cache_avl_struct_t;
208 
209 static uint64_t	rib_total_buffers = 0;
210 uint64_t	cache_limit = 100 * 1024 * 1024;
211 static volatile uint64_t	cache_allocation = 0;
212 static uint64_t	cache_watermark = 80 * 1024 * 1024;
213 static uint64_t	cache_hits = 0;
214 static uint64_t	cache_misses = 0;
215 static uint64_t	cache_cold_misses = 0;
216 static uint64_t	cache_hot_misses = 0;
217 static uint64_t	cache_misses_above_the_limit = 0;
218 static bool_t	stats_enabled = FALSE;
219 
220 static uint64_t max_unsignaled_rws = 5;
221 int nfs_rdma_port = NFS_RDMA_PORT;
222 
223 /*
224  * rib_stat: private data pointer used when registering
225  *	with the IBTF.  It is returned to the consumer
226  *	in all callbacks.
227  */
228 static rpcib_state_t *rib_stat = NULL;
229 
230 #define	RNR_RETRIES	IBT_RNR_RETRY_1
231 #define	MAX_PORTS	2
232 
233 int preposted_rbufs = RDMA_BUFS_GRANT;
234 int send_threshold = 1;
235 
236 /*
237  * State of the plugin.
238  * ACCEPT = accepting new connections and requests.
239  * NO_ACCEPT = not accepting new connection and requests.
240  * This should eventually move to rpcib_state_t structure, since this
241  * will tell in which state the plugin is for a particular type of service
242  * like NFS, NLM or v4 Callback deamon. The plugin might be in accept
243  * state for one and in no_accept state for the other.
244  */
245 int		plugin_state;
246 kmutex_t	plugin_state_lock;
247 
248 ldi_ident_t rpcib_li;
249 
250 /*
251  * RPCIB RDMATF operations
252  */
253 #if defined(MEASURE_POOL_DEPTH)
254 static void rib_posted_rbufs(uint32_t x) { return; }
255 #endif
256 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
257 static rdma_stat rib_disconnect(CONN *conn);
258 static void rib_listen(struct rdma_svc_data *rd);
259 static void rib_listen_stop(struct rdma_svc_data *rd);
260 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
261 	uint_t buflen, struct mrc *buf_handle);
262 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
263 	struct mrc buf_handle);
264 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
265 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
266 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
267 		struct mrc buf_handle);
268 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
269 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
270 	void *lrc);
271 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
272 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
273 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
274 	caddr_t buf, int len, int cpu);
275 
276 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
277 
278 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
279 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
280 
281 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
282 
283 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
284 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
285 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
286 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
287 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
288 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
289 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
290 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
291 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
292 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
293 static rdma_stat rib_conn_release(CONN *conn);
294 static rdma_stat rib_getinfo(rdma_info_t *info);
295 
296 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
297 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
298 static void rib_destroy_cache(rib_hca_t *hca);
299 static	void	rib_server_side_cache_reclaim(void *argp);
300 static int avl_compare(const void *t1, const void *t2);
301 
302 static void rib_stop_services(rib_hca_t *);
303 static void rib_close_channels(rib_conn_list_t *);
304 
305 /*
306  * RPCIB addressing operations
307  */
308 
309 /*
310  * RDMA operations the RPCIB module exports
311  */
312 static rdmaops_t rib_ops = {
313 	rib_reachable,
314 	rib_conn_get,
315 	rib_conn_release,
316 	rib_listen,
317 	rib_listen_stop,
318 	rib_registermem,
319 	rib_deregistermem,
320 	rib_registermemsync,
321 	rib_deregistermemsync,
322 	rib_syncmem,
323 	rib_reg_buf_alloc,
324 	rib_reg_buf_free,
325 	rib_send,
326 	rib_send_resp,
327 	rib_post_resp,
328 	rib_post_resp_remove,
329 	rib_post_recv,
330 	rib_recv,
331 	rib_read,
332 	rib_write,
333 	rib_getinfo,
334 };
335 
336 /*
337  * RDMATF RPCIB plugin details
338  */
339 static rdma_mod_t rib_mod = {
340 	"ibtf",		/* api name */
341 	RDMATF_VERS_1,
342 	0,
343 	&rib_ops,	/* rdma op vector for ibtf */
344 };
345 
346 static rdma_stat open_hcas(rpcib_state_t *);
347 static rdma_stat rib_qp_init(rib_qp_t *, int);
348 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
349 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
350 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
351 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
352 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
353 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
354 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
355 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
356 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
357 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
358 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
359 	rib_qp_t **);
360 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
361 	rib_qp_t **);
362 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
363 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
364 static int rib_free_sendwait(struct send_wid *);
365 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
366 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
367 static void rdma_done_rem_list(rib_qp_t *);
368 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
369 
370 static void rib_async_handler(void *,
371 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
372 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
373 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
374 static int rib_free_svc_recv(struct svc_recv *);
375 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
376 static void rib_free_wid(struct recv_wid *);
377 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
378 static void rib_detach_hca(rib_hca_t *);
379 
380 /*
381  * Registration with IBTF as a consumer
382  */
383 static struct ibt_clnt_modinfo_s rib_modinfo = {
384 	IBTI_V_CURR,
385 	IBT_GENERIC,
386 	rib_async_handler,	/* async event handler */
387 	NULL,			/* Memory Region Handler */
388 	"nfs/ib"
389 };
390 
391 /*
392  * Global strucuture
393  */
394 
395 typedef struct rpcib_s {
396 	dev_info_t	*rpcib_dip;
397 	kmutex_t	rpcib_mutex;
398 } rpcib_t;
399 
400 rpcib_t rpcib;
401 
402 /*
403  * /etc/system controlled variable to control
404  * debugging in rpcib kernel module.
405  * Set it to values greater that 1 to control
406  * the amount of debugging messages required.
407  */
408 int rib_debug = 0;
409 
410 int
411 _init(void)
412 {
413 	int error;
414 
415 	error = mod_install((struct modlinkage *)&rib_modlinkage);
416 	if (error != 0) {
417 		/*
418 		 * Could not load module
419 		 */
420 		return (error);
421 	}
422 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
423 	return (0);
424 }
425 
426 int
427 _fini()
428 {
429 	int status;
430 
431 	/*
432 	 * Remove module
433 	 */
434 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
435 		return (status);
436 	}
437 	mutex_destroy(&plugin_state_lock);
438 	return (0);
439 }
440 
441 int
442 _info(struct modinfo *modinfop)
443 {
444 	return (mod_info(&rib_modlinkage, modinfop));
445 }
446 
447 /*
448  * rpcib_getinfo()
449  * Given the device number, return the devinfo pointer or the
450  * instance number.
451  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
452  */
453 
454 /*ARGSUSED*/
455 static int
456 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
457 {
458 	int ret = DDI_SUCCESS;
459 
460 	switch (cmd) {
461 	case DDI_INFO_DEVT2DEVINFO:
462 		if (rpcib.rpcib_dip != NULL)
463 			*result = rpcib.rpcib_dip;
464 		else {
465 			*result = NULL;
466 			ret = DDI_FAILURE;
467 		}
468 		break;
469 
470 	case DDI_INFO_DEVT2INSTANCE:
471 		*result = NULL;
472 		break;
473 
474 	default:
475 		ret = DDI_FAILURE;
476 	}
477 	return (ret);
478 }
479 
480 static int
481 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
482 {
483 	ibt_status_t	ibt_status;
484 	rdma_stat	r_status;
485 
486 	switch (cmd) {
487 	case DDI_ATTACH:
488 		break;
489 	case DDI_RESUME:
490 		return (DDI_SUCCESS);
491 	default:
492 		return (DDI_FAILURE);
493 	}
494 
495 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
496 
497 	mutex_enter(&rpcib.rpcib_mutex);
498 	if (rpcib.rpcib_dip != NULL) {
499 		mutex_exit(&rpcib.rpcib_mutex);
500 		return (DDI_FAILURE);
501 	}
502 	rpcib.rpcib_dip = dip;
503 	mutex_exit(&rpcib.rpcib_mutex);
504 	/*
505 	 * Create the "rpcib" minor-node.
506 	 */
507 	if (ddi_create_minor_node(dip,
508 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
509 		/* Error message, no cmn_err as they print on console */
510 		return (DDI_FAILURE);
511 	}
512 
513 	if (rib_stat == NULL) {
514 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
515 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
516 	}
517 
518 	rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
519 	if (rib_stat->hca_count < 1) {
520 		mutex_destroy(&rib_stat->open_hca_lock);
521 		kmem_free(rib_stat, sizeof (*rib_stat));
522 		rib_stat = NULL;
523 		return (DDI_FAILURE);
524 	}
525 
526 	ibt_status = ibt_attach(&rib_modinfo, dip,
527 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
528 
529 	if (ibt_status != IBT_SUCCESS) {
530 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
531 		mutex_destroy(&rib_stat->open_hca_lock);
532 		kmem_free(rib_stat, sizeof (*rib_stat));
533 		rib_stat = NULL;
534 		return (DDI_FAILURE);
535 	}
536 
537 	mutex_enter(&rib_stat->open_hca_lock);
538 	if (open_hcas(rib_stat) != RDMA_SUCCESS) {
539 		mutex_exit(&rib_stat->open_hca_lock);
540 		goto open_fail;
541 	}
542 	mutex_exit(&rib_stat->open_hca_lock);
543 
544 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
545 	    DDI_PROP_SUCCESS) {
546 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
547 		    "failed.");
548 		goto register_fail;
549 	}
550 
551 	/*
552 	 * Register with rdmatf
553 	 */
554 	rib_mod.rdma_count = rib_stat->nhca_inited;
555 	r_status = rdma_register_mod(&rib_mod);
556 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
557 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
558 		    "status = %d", r_status);
559 		goto register_fail;
560 	}
561 
562 	return (DDI_SUCCESS);
563 
564 register_fail:
565 	rib_detach_hca(rib_stat->hca);
566 open_fail:
567 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
568 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
569 	mutex_destroy(&rib_stat->open_hca_lock);
570 	kmem_free(rib_stat, sizeof (*rib_stat));
571 	rib_stat = NULL;
572 	return (DDI_FAILURE);
573 }
574 
575 /*ARGSUSED*/
576 static int
577 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
578 {
579 	switch (cmd) {
580 
581 	case DDI_DETACH:
582 		break;
583 
584 	case DDI_SUSPEND:
585 	default:
586 		return (DDI_FAILURE);
587 	}
588 
589 	/*
590 	 * Detach the hca and free resources
591 	 */
592 	mutex_enter(&plugin_state_lock);
593 	plugin_state = NO_ACCEPT;
594 	mutex_exit(&plugin_state_lock);
595 	rib_detach_hca(rib_stat->hca);
596 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
597 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
598 	mutex_destroy(&rib_stat->open_hca_lock);
599 	if (rib_stat->hcas) {
600 		kmem_free(rib_stat->hcas, rib_stat->hca_count *
601 		    sizeof (rib_hca_t));
602 		rib_stat->hcas = NULL;
603 	}
604 	kmem_free(rib_stat, sizeof (*rib_stat));
605 	rib_stat = NULL;
606 
607 	mutex_enter(&rpcib.rpcib_mutex);
608 	rpcib.rpcib_dip = NULL;
609 	mutex_exit(&rpcib.rpcib_mutex);
610 	mutex_destroy(&rpcib.rpcib_mutex);
611 	return (DDI_SUCCESS);
612 }
613 
614 
615 static void rib_rbufpool_free(rib_hca_t *, int);
616 static void rib_rbufpool_deregister(rib_hca_t *, int);
617 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
618 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
619 static rdma_stat rib_rem_replylist(rib_qp_t *);
620 static int rib_remreply(rib_qp_t *, struct reply *);
621 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
622 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
623 
624 
625 /*
626  * One CQ pair per HCA
627  */
628 static rdma_stat
629 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
630 	rib_cq_t **cqp, rpcib_state_t *ribstat)
631 {
632 	rib_cq_t	*cq;
633 	ibt_cq_attr_t	cq_attr;
634 	uint32_t	real_size;
635 	ibt_status_t	status;
636 	rdma_stat	error = RDMA_SUCCESS;
637 
638 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
639 	cq->rib_hca = hca;
640 	cq_attr.cq_size = cq_size;
641 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
642 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
643 	    &real_size);
644 	if (status != IBT_SUCCESS) {
645 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
646 		    " status=%d", status);
647 		error = RDMA_FAILED;
648 		goto fail;
649 	}
650 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
651 
652 	/*
653 	 * Enable CQ callbacks. CQ Callbacks are single shot
654 	 * (e.g. you have to call ibt_enable_cq_notify()
655 	 * after each callback to get another one).
656 	 */
657 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
658 	if (status != IBT_SUCCESS) {
659 		cmn_err(CE_WARN, "rib_create_cq: "
660 		    "enable_cq_notify failed, status %d", status);
661 		error = RDMA_FAILED;
662 		goto fail;
663 	}
664 	*cqp = cq;
665 
666 	return (error);
667 fail:
668 	if (cq->rib_cq_hdl)
669 		(void) ibt_free_cq(cq->rib_cq_hdl);
670 	if (cq)
671 		kmem_free(cq, sizeof (rib_cq_t));
672 	return (error);
673 }
674 
675 static rdma_stat
676 open_hcas(rpcib_state_t *ribstat)
677 {
678 	rib_hca_t		*hca;
679 	ibt_status_t		ibt_status;
680 	rdma_stat		status;
681 	ibt_hca_portinfo_t	*pinfop;
682 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
683 	uint_t			size, cq_size;
684 	int			i;
685 	kstat_t *ksp;
686 	cache_avl_struct_t example_avl_node;
687 	char rssc_name[32];
688 
689 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
690 
691 	if (ribstat->hcas == NULL)
692 		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
693 		    sizeof (rib_hca_t), KM_SLEEP);
694 
695 	/*
696 	 * Open a hca and setup for RDMA
697 	 */
698 	for (i = 0; i < ribstat->hca_count; i++) {
699 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
700 		    ribstat->hca_guids[i],
701 		    &ribstat->hcas[i].hca_hdl);
702 		if (ibt_status != IBT_SUCCESS) {
703 			continue;
704 		}
705 		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
706 		hca = &(ribstat->hcas[i]);
707 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
708 		hca->state = HCA_INITED;
709 
710 		/*
711 		 * query HCA info
712 		 */
713 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
714 		if (ibt_status != IBT_SUCCESS) {
715 			goto fail1;
716 		}
717 
718 		/*
719 		 * One PD (Protection Domain) per HCA.
720 		 * A qp is allowed to access a memory region
721 		 * only when it's in the same PD as that of
722 		 * the memory region.
723 		 */
724 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
725 		if (ibt_status != IBT_SUCCESS) {
726 			goto fail1;
727 		}
728 
729 		/*
730 		 * query HCA ports
731 		 */
732 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
733 		    0, &pinfop, &hca->hca_nports, &size);
734 		if (ibt_status != IBT_SUCCESS) {
735 			goto fail2;
736 		}
737 		hca->hca_ports = pinfop;
738 		hca->hca_pinfosz = size;
739 		pinfop = NULL;
740 
741 		cq_size = DEF_CQ_SIZE; /* default cq size */
742 		/*
743 		 * Create 2 pairs of cq's (1 pair for client
744 		 * and the other pair for server) on this hca.
745 		 * If number of qp's gets too large, then several
746 		 * cq's will be needed.
747 		 */
748 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
749 		    &hca->svc_rcq, ribstat);
750 		if (status != RDMA_SUCCESS) {
751 			goto fail3;
752 		}
753 
754 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
755 		    &hca->svc_scq, ribstat);
756 		if (status != RDMA_SUCCESS) {
757 			goto fail3;
758 		}
759 
760 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
761 		    &hca->clnt_rcq, ribstat);
762 		if (status != RDMA_SUCCESS) {
763 			goto fail3;
764 		}
765 
766 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
767 		    &hca->clnt_scq, ribstat);
768 		if (status != RDMA_SUCCESS) {
769 			goto fail3;
770 		}
771 
772 		/*
773 		 * Create buffer pools.
774 		 * Note rib_rbuf_create also allocates memory windows.
775 		 */
776 		hca->recv_pool = rib_rbufpool_create(hca,
777 		    RECV_BUFFER, MAX_BUFS);
778 		if (hca->recv_pool == NULL) {
779 			goto fail3;
780 		}
781 
782 		hca->send_pool = rib_rbufpool_create(hca,
783 		    SEND_BUFFER, MAX_BUFS);
784 		if (hca->send_pool == NULL) {
785 			rib_rbufpool_destroy(hca, RECV_BUFFER);
786 			goto fail3;
787 		}
788 
789 		if (hca->server_side_cache == NULL) {
790 			(void) sprintf(rssc_name,
791 			    "rib_server_side_cache_%04d", i);
792 			hca->server_side_cache = kmem_cache_create(
793 			    rssc_name,
794 			    sizeof (cache_avl_struct_t), 0,
795 			    NULL,
796 			    NULL,
797 			    rib_server_side_cache_reclaim,
798 			    hca, NULL, 0);
799 		}
800 
801 		avl_create(&hca->avl_tree,
802 		    avl_compare,
803 		    sizeof (cache_avl_struct_t),
804 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
805 		    (uint_t)(uintptr_t)&example_avl_node);
806 
807 		rw_init(&hca->avl_rw_lock,
808 		    NULL, RW_DRIVER, hca->iblock);
809 		mutex_init(&hca->cache_allocation,
810 		    NULL, MUTEX_DRIVER, NULL);
811 		hca->avl_init = TRUE;
812 
813 		/* Create kstats for the cache */
814 		ASSERT(INGLOBALZONE(curproc));
815 
816 		if (!stats_enabled) {
817 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
818 			    KSTAT_TYPE_NAMED,
819 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
820 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
821 			    GLOBAL_ZONEID);
822 			if (ksp) {
823 				ksp->ks_data = (void *) &rpcib_kstat;
824 				ksp->ks_update = rpcib_cache_kstat_update;
825 				kstat_install(ksp);
826 				stats_enabled = TRUE;
827 			}
828 		}
829 		if (NULL == hca->reg_cache_clean_up) {
830 			hca->reg_cache_clean_up = ddi_taskq_create(NULL,
831 			    "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0);
832 		}
833 
834 		/*
835 		 * Initialize the registered service list and
836 		 * the lock
837 		 */
838 		hca->service_list = NULL;
839 		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
840 
841 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
842 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
843 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
844 		    hca->iblock);
845 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
846 		    hca->iblock);
847 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
848 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
849 		hca->inuse = TRUE;
850 		/*
851 		 * XXX One hca only. Add multi-hca functionality if needed
852 		 * later.
853 		 */
854 		ribstat->hca = hca;
855 		ribstat->nhca_inited++;
856 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
857 		break;
858 
859 fail3:
860 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
861 fail2:
862 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
863 fail1:
864 		(void) ibt_close_hca(hca->hca_hdl);
865 
866 	}
867 	if (ribstat->hca != NULL)
868 		return (RDMA_SUCCESS);
869 	else
870 		return (RDMA_FAILED);
871 }
872 
873 /*
874  * Callback routines
875  */
876 
877 /*
878  * SCQ handlers
879  */
880 /* ARGSUSED */
881 static void
882 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
883 {
884 	ibt_status_t	ibt_status;
885 	ibt_wc_t	wc;
886 	int		i;
887 
888 	/*
889 	 * Re-enable cq notify here to avoid missing any
890 	 * completion queue notification.
891 	 */
892 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
893 
894 	ibt_status = IBT_SUCCESS;
895 	while (ibt_status != IBT_CQ_EMPTY) {
896 	bzero(&wc, sizeof (wc));
897 	ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
898 	if (ibt_status != IBT_SUCCESS)
899 		return;
900 
901 	/*
902 	 * Got a send completion
903 	 */
904 	if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
905 		struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
906 		CONN	*conn = qptoc(wd->qp);
907 
908 		mutex_enter(&wd->sendwait_lock);
909 		switch (wc.wc_status) {
910 		case IBT_WC_SUCCESS:
911 			wd->status = RDMA_SUCCESS;
912 			break;
913 		case IBT_WC_WR_FLUSHED_ERR:
914 			wd->status = RDMA_FAILED;
915 			break;
916 		default:
917 /*
918  *    RC Send Q Error Code		Local state     Remote State
919  *    ==================== 		===========     ============
920  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
921  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
922  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
923  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
924  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
925  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
926  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
927  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
928  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
929  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
930  *    IBT_WC_WR_FLUSHED_ERR               None            None
931  */
932 			/*
933 			 * Channel in error state. Set connection to
934 			 * ERROR and cleanup will happen either from
935 			 * conn_release  or from rib_conn_get
936 			 */
937 			wd->status = RDMA_FAILED;
938 			mutex_enter(&conn->c_lock);
939 			if (conn->c_state != C_DISCONN_PEND)
940 				conn->c_state = C_ERROR_CONN;
941 			mutex_exit(&conn->c_lock);
942 			break;
943 		}
944 
945 		if (wd->cv_sig == 1) {
946 			/*
947 			 * Notify poster
948 			 */
949 			cv_signal(&wd->wait_cv);
950 			mutex_exit(&wd->sendwait_lock);
951 		} else {
952 			/*
953 			 * Poster not waiting for notification.
954 			 * Free the send buffers and send_wid
955 			 */
956 			for (i = 0; i < wd->nsbufs; i++) {
957 				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
958 				    (void *)(uintptr_t)wd->sbufaddr[i]);
959 				}
960 			mutex_exit(&wd->sendwait_lock);
961 			(void) rib_free_sendwait(wd);
962 			}
963 		}
964 	}
965 }
966 
967 /* ARGSUSED */
968 static void
969 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
970 {
971 	ibt_status_t	ibt_status;
972 	ibt_wc_t	wc;
973 	int		i;
974 
975 	/*
976 	 * Re-enable cq notify here to avoid missing any
977 	 * completion queue notification.
978 	 */
979 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
980 
981 	ibt_status = IBT_SUCCESS;
982 	while (ibt_status != IBT_CQ_EMPTY) {
983 		bzero(&wc, sizeof (wc));
984 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
985 		if (ibt_status != IBT_SUCCESS)
986 			return;
987 
988 		/*
989 		 * Got a send completion
990 		 */
991 		if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
992 			struct send_wid *wd =
993 			    (struct send_wid *)(uintptr_t)wc.wc_id;
994 			mutex_enter(&wd->sendwait_lock);
995 			if (wd->cv_sig == 1) {
996 				/*
997 				 * Update completion status and notify poster
998 				 */
999 				if (wc.wc_status == IBT_WC_SUCCESS)
1000 					wd->status = RDMA_SUCCESS;
1001 				else
1002 					wd->status = RDMA_FAILED;
1003 				cv_signal(&wd->wait_cv);
1004 				mutex_exit(&wd->sendwait_lock);
1005 			} else {
1006 				/*
1007 				 * Poster not waiting for notification.
1008 				 * Free the send buffers and send_wid
1009 				 */
1010 				for (i = 0; i < wd->nsbufs; i++) {
1011 					rib_rbuf_free(qptoc(wd->qp),
1012 					    SEND_BUFFER,
1013 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1014 				}
1015 				mutex_exit(&wd->sendwait_lock);
1016 				(void) rib_free_sendwait(wd);
1017 			}
1018 		}
1019 	}
1020 }
1021 
1022 /*
1023  * RCQ handler
1024  */
1025 /* ARGSUSED */
1026 static void
1027 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1028 {
1029 	rib_qp_t	*qp;
1030 	ibt_status_t	ibt_status;
1031 	ibt_wc_t	wc;
1032 	struct recv_wid	*rwid;
1033 
1034 	/*
1035 	 * Re-enable cq notify here to avoid missing any
1036 	 * completion queue notification.
1037 	 */
1038 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1039 
1040 	ibt_status = IBT_SUCCESS;
1041 	while (ibt_status != IBT_CQ_EMPTY) {
1042 		bzero(&wc, sizeof (wc));
1043 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1044 		if (ibt_status != IBT_SUCCESS)
1045 			return;
1046 
1047 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1048 		qp = rwid->qp;
1049 		if (wc.wc_status == IBT_WC_SUCCESS) {
1050 			XDR	inxdrs, *xdrs;
1051 			uint_t	xid, vers, op, find_xid = 0;
1052 			struct reply	*r;
1053 			CONN *conn = qptoc(qp);
1054 			uint32_t rdma_credit = 0;
1055 
1056 			xdrs = &inxdrs;
1057 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1058 			    wc.wc_bytes_xfer, XDR_DECODE);
1059 			/*
1060 			 * Treat xid as opaque (xid is the first entity
1061 			 * in the rpc rdma message).
1062 			 */
1063 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1064 
1065 			/* Skip xid and set the xdr position accordingly. */
1066 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1067 			(void) xdr_u_int(xdrs, &vers);
1068 			(void) xdr_u_int(xdrs, &rdma_credit);
1069 			(void) xdr_u_int(xdrs, &op);
1070 			XDR_DESTROY(xdrs);
1071 
1072 			if (vers != RPCRDMA_VERS) {
1073 				/*
1074 				 * Invalid RPC/RDMA version. Cannot
1075 				 * interoperate.  Set connection to
1076 				 * ERROR state and bail out.
1077 				 */
1078 				mutex_enter(&conn->c_lock);
1079 				if (conn->c_state != C_DISCONN_PEND)
1080 					conn->c_state = C_ERROR_CONN;
1081 				mutex_exit(&conn->c_lock);
1082 				rib_rbuf_free(conn, RECV_BUFFER,
1083 				    (void *)(uintptr_t)rwid->addr);
1084 				rib_free_wid(rwid);
1085 				continue;
1086 			}
1087 
1088 			mutex_enter(&qp->replylist_lock);
1089 			for (r = qp->replylist; r != NULL; r = r->next) {
1090 				if (r->xid == xid) {
1091 					find_xid = 1;
1092 					switch (op) {
1093 					case RDMA_MSG:
1094 					case RDMA_NOMSG:
1095 					case RDMA_MSGP:
1096 						r->status = RDMA_SUCCESS;
1097 						r->vaddr_cq = rwid->addr;
1098 						r->bytes_xfer =
1099 						    wc.wc_bytes_xfer;
1100 						cv_signal(&r->wait_cv);
1101 						break;
1102 					default:
1103 						rib_rbuf_free(qptoc(qp),
1104 						    RECV_BUFFER,
1105 						    (void *)(uintptr_t)
1106 						    rwid->addr);
1107 						break;
1108 					}
1109 					break;
1110 				}
1111 			}
1112 			mutex_exit(&qp->replylist_lock);
1113 			if (find_xid == 0) {
1114 				/* RPC caller not waiting for reply */
1115 
1116 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1117 				    int, xid);
1118 
1119 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1120 				    (void *)(uintptr_t)rwid->addr);
1121 			}
1122 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1123 			CONN *conn = qptoc(qp);
1124 
1125 			/*
1126 			 * Connection being flushed. Just free
1127 			 * the posted buffer
1128 			 */
1129 			rib_rbuf_free(conn, RECV_BUFFER,
1130 			    (void *)(uintptr_t)rwid->addr);
1131 		} else {
1132 			CONN *conn = qptoc(qp);
1133 /*
1134  *  RC Recv Q Error Code		Local state     Remote State
1135  *  ====================		===========     ============
1136  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1137  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1138  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1139  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1140  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1141  *  IBT_WC_WR_FLUSHED_ERR               None            None
1142  */
1143 			/*
1144 			 * Channel in error state. Set connection
1145 			 * in ERROR state.
1146 			 */
1147 			mutex_enter(&conn->c_lock);
1148 			if (conn->c_state != C_DISCONN_PEND)
1149 				conn->c_state = C_ERROR_CONN;
1150 			mutex_exit(&conn->c_lock);
1151 			rib_rbuf_free(conn, RECV_BUFFER,
1152 			    (void *)(uintptr_t)rwid->addr);
1153 		}
1154 		rib_free_wid(rwid);
1155 	}
1156 }
1157 
1158 /* Server side */
1159 /* ARGSUSED */
1160 static void
1161 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1162 {
1163 	rdma_recv_data_t *rdp;
1164 	rib_qp_t	*qp;
1165 	ibt_status_t	ibt_status;
1166 	ibt_wc_t	wc;
1167 	struct svc_recv	*s_recvp;
1168 	CONN		*conn;
1169 	mblk_t		*mp;
1170 
1171 	/*
1172 	 * Re-enable cq notify here to avoid missing any
1173 	 * completion queue notification.
1174 	 */
1175 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1176 
1177 	ibt_status = IBT_SUCCESS;
1178 	while (ibt_status != IBT_CQ_EMPTY) {
1179 		bzero(&wc, sizeof (wc));
1180 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1181 		if (ibt_status != IBT_SUCCESS)
1182 			return;
1183 
1184 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1185 		qp = s_recvp->qp;
1186 		conn = qptoc(qp);
1187 		mutex_enter(&qp->posted_rbufs_lock);
1188 		qp->n_posted_rbufs--;
1189 #if defined(MEASURE_POOL_DEPTH)
1190 		rib_posted_rbufs(preposted_rbufs -  qp->n_posted_rbufs);
1191 #endif
1192 		if (qp->n_posted_rbufs == 0)
1193 			cv_signal(&qp->posted_rbufs_cv);
1194 		mutex_exit(&qp->posted_rbufs_lock);
1195 
1196 		if (wc.wc_status == IBT_WC_SUCCESS) {
1197 			XDR	inxdrs, *xdrs;
1198 			uint_t	xid, vers, op;
1199 			uint32_t rdma_credit;
1200 
1201 			xdrs = &inxdrs;
1202 			/* s_recvp->vaddr stores data */
1203 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1204 			    wc.wc_bytes_xfer, XDR_DECODE);
1205 
1206 			/*
1207 			 * Treat xid as opaque (xid is the first entity
1208 			 * in the rpc rdma message).
1209 			 */
1210 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1211 			/* Skip xid and set the xdr position accordingly. */
1212 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1213 			if (!xdr_u_int(xdrs, &vers) ||
1214 			    !xdr_u_int(xdrs, &rdma_credit) ||
1215 			    !xdr_u_int(xdrs, &op)) {
1216 				rib_rbuf_free(conn, RECV_BUFFER,
1217 				    (void *)(uintptr_t)s_recvp->vaddr);
1218 				XDR_DESTROY(xdrs);
1219 				(void) rib_free_svc_recv(s_recvp);
1220 				continue;
1221 			}
1222 			XDR_DESTROY(xdrs);
1223 
1224 			if (vers != RPCRDMA_VERS) {
1225 				/*
1226 				 * Invalid RPC/RDMA version.
1227 				 * Drop rpc rdma message.
1228 				 */
1229 				rib_rbuf_free(conn, RECV_BUFFER,
1230 				    (void *)(uintptr_t)s_recvp->vaddr);
1231 				(void) rib_free_svc_recv(s_recvp);
1232 				continue;
1233 			}
1234 			/*
1235 			 * Is this for RDMA_DONE?
1236 			 */
1237 			if (op == RDMA_DONE) {
1238 				rib_rbuf_free(conn, RECV_BUFFER,
1239 				    (void *)(uintptr_t)s_recvp->vaddr);
1240 				/*
1241 				 * Wake up the thread waiting on
1242 				 * a RDMA_DONE for xid
1243 				 */
1244 				mutex_enter(&qp->rdlist_lock);
1245 				rdma_done_notify(qp, xid);
1246 				mutex_exit(&qp->rdlist_lock);
1247 				(void) rib_free_svc_recv(s_recvp);
1248 				continue;
1249 			}
1250 
1251 			mutex_enter(&plugin_state_lock);
1252 			if (plugin_state == ACCEPT) {
1253 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1254 				    == NULL)
1255 					(void) strwaitbuf(
1256 					    sizeof (*rdp), BPRI_LO);
1257 				/*
1258 				 * Plugin is in accept state, hence the master
1259 				 * transport queue for this is still accepting
1260 				 * requests. Hence we can call svc_queuereq to
1261 				 * queue this recieved msg.
1262 				 */
1263 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1264 				rdp->conn = conn;
1265 				rdp->rpcmsg.addr =
1266 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1267 				rdp->rpcmsg.type = RECV_BUFFER;
1268 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1269 				rdp->status = wc.wc_status;
1270 				mutex_enter(&conn->c_lock);
1271 				conn->c_ref++;
1272 				mutex_exit(&conn->c_lock);
1273 				mp->b_wptr += sizeof (*rdp);
1274 				svc_queuereq((queue_t *)rib_stat->q, mp);
1275 				mutex_exit(&plugin_state_lock);
1276 			} else {
1277 				/*
1278 				 * The master transport for this is going
1279 				 * away and the queue is not accepting anymore
1280 				 * requests for krpc, so don't do anything, just
1281 				 * free the msg.
1282 				 */
1283 				mutex_exit(&plugin_state_lock);
1284 				rib_rbuf_free(conn, RECV_BUFFER,
1285 				    (void *)(uintptr_t)s_recvp->vaddr);
1286 			}
1287 		} else {
1288 			rib_rbuf_free(conn, RECV_BUFFER,
1289 			    (void *)(uintptr_t)s_recvp->vaddr);
1290 		}
1291 		(void) rib_free_svc_recv(s_recvp);
1292 	}
1293 }
1294 
1295 /*
1296  * Handles DR event of IBT_HCA_DETACH_EVENT.
1297  */
1298 /* ARGSUSED */
1299 static void
1300 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1301 	ibt_async_code_t code, ibt_async_event_t *event)
1302 {
1303 
1304 	switch (code) {
1305 	case IBT_HCA_ATTACH_EVENT:
1306 		/* ignore */
1307 		break;
1308 	case IBT_HCA_DETACH_EVENT:
1309 	{
1310 		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1311 		rib_detach_hca(rib_stat->hca);
1312 #ifdef DEBUG
1313 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1314 #endif
1315 		break;
1316 	}
1317 #ifdef DEBUG
1318 	case IBT_EVENT_PATH_MIGRATED:
1319 		cmn_err(CE_NOTE, "rib_async_handler(): "
1320 		    "IBT_EVENT_PATH_MIGRATED\n");
1321 		break;
1322 	case IBT_EVENT_SQD:
1323 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1324 		break;
1325 	case IBT_EVENT_COM_EST:
1326 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1327 		break;
1328 	case IBT_ERROR_CATASTROPHIC_CHAN:
1329 		cmn_err(CE_NOTE, "rib_async_handler(): "
1330 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1331 		break;
1332 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1333 		cmn_err(CE_NOTE, "rib_async_handler(): "
1334 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1335 		break;
1336 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1337 		cmn_err(CE_NOTE, "rib_async_handler(): "
1338 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1339 		break;
1340 	case IBT_ERROR_PATH_MIGRATE_REQ:
1341 		cmn_err(CE_NOTE, "rib_async_handler(): "
1342 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1343 		break;
1344 	case IBT_ERROR_CQ:
1345 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1346 		break;
1347 	case IBT_ERROR_PORT_DOWN:
1348 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1349 		break;
1350 	case IBT_EVENT_PORT_UP:
1351 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1352 		break;
1353 	case IBT_ASYNC_OPAQUE1:
1354 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1355 		break;
1356 	case IBT_ASYNC_OPAQUE2:
1357 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1358 		break;
1359 	case IBT_ASYNC_OPAQUE3:
1360 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1361 		break;
1362 	case IBT_ASYNC_OPAQUE4:
1363 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1364 		break;
1365 #endif
1366 	default:
1367 		break;
1368 	}
1369 }
1370 
1371 /*
1372  * Client's reachable function.
1373  */
1374 static rdma_stat
1375 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1376 {
1377 	rdma_stat	status;
1378 	rpcib_ping_t	rpt;
1379 
1380 	/*
1381 	 * First check if a hca is still attached
1382 	 */
1383 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
1384 	if (rib_stat->hca->state != HCA_INITED) {
1385 		rw_exit(&rib_stat->hca->state_lock);
1386 		return (RDMA_FAILED);
1387 	}
1388 
1389 	bzero(&rpt, sizeof (rpcib_ping_t));
1390 	status = rib_ping_srv(addr_type, raddr, &rpt);
1391 	rw_exit(&rib_stat->hca->state_lock);
1392 
1393 	if (status == RDMA_SUCCESS) {
1394 		*handle = (void *)rpt.hca;
1395 		return (RDMA_SUCCESS);
1396 	} else {
1397 		*handle = NULL;
1398 		DTRACE_PROBE(rpcib__i__pingfailed);
1399 		return (RDMA_FAILED);
1400 	}
1401 }
1402 
1403 /* Client side qp creation */
1404 static rdma_stat
1405 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1406 {
1407 	rib_qp_t	*kqp = NULL;
1408 	CONN		*conn;
1409 	rdma_clnt_cred_ctrl_t *cc_info;
1410 
1411 	ASSERT(qp != NULL);
1412 	*qp = NULL;
1413 
1414 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1415 	conn = qptoc(kqp);
1416 	kqp->hca = hca;
1417 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1418 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1419 
1420 	kqp->mode = RIB_CLIENT;
1421 	kqp->chan_flags = IBT_BLOCKING;
1422 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1423 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1424 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1425 	/*
1426 	 * Initialize
1427 	 */
1428 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1429 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1430 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1431 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1432 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1433 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1434 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1435 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1436 	/*
1437 	 * Initialize the client credit control
1438 	 * portion of the rdmaconn struct.
1439 	 */
1440 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1441 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1442 	cc_info->clnt_cc_granted_ops = 0;
1443 	cc_info->clnt_cc_in_flight_ops = 0;
1444 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1445 
1446 	*qp = kqp;
1447 	return (RDMA_SUCCESS);
1448 }
1449 
1450 /* Server side qp creation */
1451 static rdma_stat
1452 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1453 {
1454 	rib_qp_t	*kqp = NULL;
1455 	ibt_chan_sizes_t	chan_sizes;
1456 	ibt_rc_chan_alloc_args_t	qp_attr;
1457 	ibt_status_t		ibt_status;
1458 	rdma_srv_cred_ctrl_t *cc_info;
1459 
1460 	*qp = NULL;
1461 
1462 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1463 	kqp->hca = hca;
1464 	kqp->port_num = port;
1465 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1466 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1467 
1468 	/*
1469 	 * Create the qp handle
1470 	 */
1471 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1472 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1473 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1474 	qp_attr.rc_pd = hca->pd_hdl;
1475 	qp_attr.rc_hca_port_num = port;
1476 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1477 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1478 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1479 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1480 	qp_attr.rc_clone_chan = NULL;
1481 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1482 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1483 
1484 	rw_enter(&hca->state_lock, RW_READER);
1485 	if (hca->state != HCA_DETACHED) {
1486 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1487 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1488 		    &chan_sizes);
1489 	} else {
1490 		rw_exit(&hca->state_lock);
1491 		goto fail;
1492 	}
1493 	rw_exit(&hca->state_lock);
1494 
1495 	if (ibt_status != IBT_SUCCESS) {
1496 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1497 		    int, ibt_status);
1498 		goto fail;
1499 	}
1500 
1501 	kqp->mode = RIB_SERVER;
1502 	kqp->chan_flags = IBT_BLOCKING;
1503 	kqp->q = q;	/* server ONLY */
1504 
1505 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1506 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1507 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1508 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1509 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1510 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1511 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1512 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1513 	/*
1514 	 * Set the private data area to qp to be used in callbacks
1515 	 */
1516 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1517 	kqp->rdmaconn.c_state = C_CONNECTED;
1518 
1519 	/*
1520 	 * Initialize the server credit control
1521 	 * portion of the rdmaconn struct.
1522 	 */
1523 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1524 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1525 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1526 	cc_info->srv_cc_cur_buffers_used = 0;
1527 	cc_info->srv_cc_posted = preposted_rbufs;
1528 
1529 	*qp = kqp;
1530 
1531 	return (RDMA_SUCCESS);
1532 fail:
1533 	if (kqp)
1534 		kmem_free(kqp, sizeof (rib_qp_t));
1535 
1536 	return (RDMA_FAILED);
1537 }
1538 
1539 /* ARGSUSED */
1540 ibt_cm_status_t
1541 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1542     ibt_cm_return_args_t *ret_args, void *priv_data,
1543     ibt_priv_data_len_t len)
1544 {
1545 	rpcib_state_t   *ribstat;
1546 	rib_hca_t	*hca;
1547 
1548 	ribstat = (rpcib_state_t *)clnt_hdl;
1549 	hca = (rib_hca_t *)ribstat->hca;
1550 
1551 	switch (event->cm_type) {
1552 
1553 	/* got a connection close event */
1554 	case IBT_CM_EVENT_CONN_CLOSED:
1555 	{
1556 		CONN	*conn;
1557 		rib_qp_t *qp;
1558 
1559 		/* check reason why connection was closed */
1560 		switch (event->cm_event.closed) {
1561 		case IBT_CM_CLOSED_DREP_RCVD:
1562 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1563 		case IBT_CM_CLOSED_DUP:
1564 		case IBT_CM_CLOSED_ABORT:
1565 		case IBT_CM_CLOSED_ALREADY:
1566 			/*
1567 			 * These cases indicate the local end initiated
1568 			 * the closing of the channel. Nothing to do here.
1569 			 */
1570 			break;
1571 		default:
1572 			/*
1573 			 * Reason for CONN_CLOSED event must be one of
1574 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1575 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1576 			 * the remote end is closing the channel. In these
1577 			 * cases free the channel and transition to error
1578 			 * state
1579 			 */
1580 			qp = ibt_get_chan_private(event->cm_channel);
1581 			conn = qptoc(qp);
1582 			mutex_enter(&conn->c_lock);
1583 			if (conn->c_state == C_DISCONN_PEND) {
1584 				mutex_exit(&conn->c_lock);
1585 				break;
1586 			}
1587 
1588 			conn->c_state = C_ERROR_CONN;
1589 
1590 			/*
1591 			 * Free the rc_channel. Channel has already
1592 			 * transitioned to ERROR state and WRs have been
1593 			 * FLUSHED_ERR already.
1594 			 */
1595 			(void) ibt_free_channel(qp->qp_hdl);
1596 			qp->qp_hdl = NULL;
1597 
1598 			/*
1599 			 * Free the conn if c_ref is down to 0 already
1600 			 */
1601 			if (conn->c_ref == 0) {
1602 				/*
1603 				 * Remove from list and free conn
1604 				 */
1605 				conn->c_state = C_DISCONN_PEND;
1606 				mutex_exit(&conn->c_lock);
1607 				(void) rib_disconnect_channel(conn,
1608 				    &hca->cl_conn_list);
1609 			} else {
1610 				mutex_exit(&conn->c_lock);
1611 			}
1612 #ifdef DEBUG
1613 			if (rib_debug)
1614 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1615 				    "(CONN_CLOSED) channel disconnected");
1616 #endif
1617 			break;
1618 		}
1619 		break;
1620 	}
1621 	default:
1622 		break;
1623 	}
1624 	return (IBT_CM_ACCEPT);
1625 }
1626 
1627 /*
1628  * Connect to the server.
1629  */
1630 rdma_stat
1631 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1632 {
1633 	ibt_chan_open_args_t	chan_args;	/* channel args */
1634 	ibt_chan_sizes_t	chan_sizes;
1635 	ibt_rc_chan_alloc_args_t	qp_attr;
1636 	ibt_status_t		ibt_status;
1637 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1638 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1639 	ibt_ip_cm_info_t	ipcm_info;
1640 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1641 
1642 
1643 	(void) bzero(&chan_args, sizeof (chan_args));
1644 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1645 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1646 
1647 	ipcm_info.src_addr.family = rptp->srcip.family;
1648 	switch (ipcm_info.src_addr.family) {
1649 	case AF_INET:
1650 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1651 		break;
1652 	case AF_INET6:
1653 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1654 		break;
1655 	}
1656 
1657 	ipcm_info.dst_addr.family = rptp->srcip.family;
1658 	switch (ipcm_info.dst_addr.family) {
1659 	case AF_INET:
1660 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1661 		break;
1662 	case AF_INET6:
1663 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1664 		break;
1665 	}
1666 
1667 	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1668 
1669 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1670 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1671 
1672 	if (ibt_status != IBT_SUCCESS) {
1673 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1674 		return (-1);
1675 	}
1676 
1677 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1678 	/* Alloc a RC channel */
1679 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1680 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1681 	qp_attr.rc_pd = hca->pd_hdl;
1682 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1683 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1684 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1685 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1686 	qp_attr.rc_clone_chan = NULL;
1687 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1688 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1689 
1690 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1691 	chan_args.oc_path = &rptp->path;
1692 
1693 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1694 	chan_args.oc_cm_clnt_private = (void *)rib_stat;
1695 	chan_args.oc_rdma_ra_out = 4;
1696 	chan_args.oc_rdma_ra_in = 4;
1697 	chan_args.oc_path_retry_cnt = 2;
1698 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1699 	chan_args.oc_priv_data = cmp_ip_pvt;
1700 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1701 
1702 refresh:
1703 	rw_enter(&hca->state_lock, RW_READER);
1704 	if (hca->state != HCA_DETACHED) {
1705 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1706 		    IBT_ACHAN_NO_FLAGS,
1707 		    &qp_attr, &qp->qp_hdl,
1708 		    &chan_sizes);
1709 	} else {
1710 		rw_exit(&hca->state_lock);
1711 		return (RDMA_FAILED);
1712 	}
1713 	rw_exit(&hca->state_lock);
1714 
1715 	if (ibt_status != IBT_SUCCESS) {
1716 		DTRACE_PROBE1(rpcib__i_conntosrv,
1717 		    int, ibt_status);
1718 		return (RDMA_FAILED);
1719 	}
1720 
1721 	/* Connect to the Server */
1722 	(void) bzero(&ret_args, sizeof (ret_args));
1723 	mutex_enter(&qp->cb_lock);
1724 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1725 	    IBT_BLOCKING, &chan_args, &ret_args);
1726 	if (ibt_status != IBT_SUCCESS) {
1727 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1728 		    int, ibt_status, int, ret_args.rc_status);
1729 
1730 		(void) ibt_free_channel(qp->qp_hdl);
1731 		qp->qp_hdl = NULL;
1732 		mutex_exit(&qp->cb_lock);
1733 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1734 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1735 			/*
1736 			 * Got IBT_CM_CONN_STALE probably because of stale
1737 			 * data on the passive end of a channel that existed
1738 			 * prior to reboot. Retry establishing a channel
1739 			 * REFRESH_ATTEMPTS times, during which time the
1740 			 * stale conditions on the server might clear up.
1741 			 */
1742 			goto refresh;
1743 		}
1744 		return (RDMA_FAILED);
1745 	}
1746 	mutex_exit(&qp->cb_lock);
1747 	/*
1748 	 * Set the private data area to qp to be used in callbacks
1749 	 */
1750 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1751 	return (RDMA_SUCCESS);
1752 }
1753 
1754 rdma_stat
1755 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1756 {
1757 	uint_t			i;
1758 	ibt_status_t		ibt_status;
1759 	uint8_t			num_paths_p;
1760 	ibt_ip_path_attr_t	ipattr;
1761 	ibt_path_ip_src_t	srcip;
1762 	rpcib_ipaddrs_t		addrs4;
1763 	rpcib_ipaddrs_t		addrs6;
1764 	struct sockaddr_in	*sinp;
1765 	struct sockaddr_in6	*sin6p;
1766 	rdma_stat		retval = RDMA_SUCCESS;
1767 
1768 	ASSERT(raddr->buf != NULL);
1769 
1770 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1771 
1772 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1773 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1774 		retval = RDMA_FAILED;
1775 		goto done;
1776 	}
1777 
1778 	/* Prep the destination address */
1779 	switch (addr_type) {
1780 	case AF_INET:
1781 		sinp = (struct sockaddr_in *)raddr->buf;
1782 		rptp->dstip.family = AF_INET;
1783 		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1784 		sinp = addrs4.ri_list;
1785 
1786 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1787 		ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1788 		ipattr.ipa_ndst		= 1;
1789 		ipattr.ipa_max_paths	= 1;
1790 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1791 		for (i = 0; i < addrs4.ri_count; i++) {
1792 			num_paths_p = 0;
1793 			ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr;
1794 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1795 
1796 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1797 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1798 			    &num_paths_p, &srcip);
1799 			if (ibt_status == IBT_SUCCESS &&
1800 			    num_paths_p != 0 &&
1801 			    rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) {
1802 				rptp->hca = rib_stat->hca;
1803 				rptp->srcip.family = AF_INET;
1804 				rptp->srcip.un.ip4addr =
1805 				    srcip.ip_primary.un.ip4addr;
1806 				goto done;
1807 			}
1808 		}
1809 		retval = RDMA_FAILED;
1810 		break;
1811 
1812 	case AF_INET6:
1813 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1814 		rptp->dstip.family = AF_INET6;
1815 		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1816 		sin6p = addrs6.ri_list;
1817 
1818 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1819 		ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1820 		ipattr.ipa_ndst		= 1;
1821 		ipattr.ipa_max_paths	= 1;
1822 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1823 		for (i = 0; i < addrs6.ri_count; i++) {
1824 			num_paths_p = 0;
1825 			ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr;
1826 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1827 
1828 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1829 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1830 			    &num_paths_p, &srcip);
1831 			if (ibt_status == IBT_SUCCESS &&
1832 			    num_paths_p != 0 &&
1833 			    rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) {
1834 				rptp->hca = rib_stat->hca;
1835 				rptp->srcip.family = AF_INET6;
1836 				rptp->srcip.un.ip6addr =
1837 				    srcip.ip_primary.un.ip6addr;
1838 				goto done;
1839 			}
1840 		}
1841 		retval = RDMA_FAILED;
1842 		break;
1843 
1844 	default:
1845 		retval = RDMA_INVAL;
1846 		break;
1847 	}
1848 done:
1849 
1850 	if (addrs4.ri_size > 0)
1851 		kmem_free(addrs4.ri_list, addrs4.ri_size);
1852 	if (addrs6.ri_size > 0)
1853 		kmem_free(addrs6.ri_list, addrs6.ri_size);
1854 	return (retval);
1855 }
1856 
1857 /*
1858  * Close channel, remove from connection list and
1859  * free up resources allocated for that channel.
1860  */
1861 rdma_stat
1862 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
1863 {
1864 	rib_qp_t	*qp = ctoqp(conn);
1865 	rib_hca_t	*hca;
1866 
1867 	/*
1868 	 * c_ref == 0 and connection is in C_DISCONN_PEND
1869 	 */
1870 	hca = qp->hca;
1871 	if (conn_list != NULL)
1872 		(void) rib_rm_conn(conn, conn_list);
1873 
1874 	if (qp->qp_hdl != NULL) {
1875 		/*
1876 		 * If the channel has not been establised,
1877 		 * ibt_flush_channel is called to flush outstanding WRs
1878 		 * on the Qs.  Otherwise, ibt_close_rc_channel() is
1879 		 * called.  The channel is then freed.
1880 		 */
1881 		if (conn_list != NULL)
1882 			(void) ibt_close_rc_channel(qp->qp_hdl,
1883 			    IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
1884 		else
1885 			(void) ibt_flush_channel(qp->qp_hdl);
1886 
1887 		mutex_enter(&qp->posted_rbufs_lock);
1888 		while (qp->n_posted_rbufs)
1889 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
1890 		mutex_exit(&qp->posted_rbufs_lock);
1891 		(void) ibt_free_channel(qp->qp_hdl);
1892 		qp->qp_hdl = NULL;
1893 	}
1894 
1895 	ASSERT(qp->rdlist == NULL);
1896 
1897 	if (qp->replylist != NULL) {
1898 		(void) rib_rem_replylist(qp);
1899 	}
1900 
1901 	cv_destroy(&qp->cb_conn_cv);
1902 	cv_destroy(&qp->posted_rbufs_cv);
1903 	mutex_destroy(&qp->cb_lock);
1904 
1905 	mutex_destroy(&qp->replylist_lock);
1906 	mutex_destroy(&qp->posted_rbufs_lock);
1907 	mutex_destroy(&qp->rdlist_lock);
1908 
1909 	cv_destroy(&conn->c_cv);
1910 	mutex_destroy(&conn->c_lock);
1911 
1912 	if (conn->c_raddr.buf != NULL) {
1913 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
1914 	}
1915 	if (conn->c_laddr.buf != NULL) {
1916 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
1917 	}
1918 
1919 	/*
1920 	 * Credit control cleanup.
1921 	 */
1922 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
1923 		rdma_clnt_cred_ctrl_t *cc_info;
1924 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1925 		cv_destroy(&cc_info->clnt_cc_cv);
1926 	}
1927 
1928 	kmem_free(qp, sizeof (rib_qp_t));
1929 
1930 	/*
1931 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
1932 	 * then the hca is no longer being used.
1933 	 */
1934 	if (conn_list != NULL) {
1935 		rw_enter(&hca->state_lock, RW_READER);
1936 		if (hca->state == HCA_DETACHED) {
1937 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
1938 			if (hca->srv_conn_list.conn_hd == NULL) {
1939 				rw_enter(&hca->cl_conn_list.conn_lock,
1940 				    RW_READER);
1941 
1942 				if (hca->cl_conn_list.conn_hd == NULL) {
1943 					mutex_enter(&hca->inuse_lock);
1944 					hca->inuse = FALSE;
1945 					cv_signal(&hca->cb_cv);
1946 					mutex_exit(&hca->inuse_lock);
1947 				}
1948 				rw_exit(&hca->cl_conn_list.conn_lock);
1949 			}
1950 			rw_exit(&hca->srv_conn_list.conn_lock);
1951 		}
1952 		rw_exit(&hca->state_lock);
1953 	}
1954 
1955 	return (RDMA_SUCCESS);
1956 }
1957 
1958 /*
1959  * Wait for send completion notification. Only on receiving a
1960  * notification be it a successful or error completion, free the
1961  * send_wid.
1962  */
1963 static rdma_stat
1964 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
1965 {
1966 	clock_t timout, cv_wait_ret;
1967 	rdma_stat error = RDMA_SUCCESS;
1968 	int	i;
1969 
1970 	/*
1971 	 * Wait for send to complete
1972 	 */
1973 	ASSERT(wd != NULL);
1974 	mutex_enter(&wd->sendwait_lock);
1975 	if (wd->status == (uint_t)SEND_WAIT) {
1976 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
1977 		    ddi_get_lbolt();
1978 
1979 		if (qp->mode == RIB_SERVER) {
1980 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
1981 			    &wd->sendwait_lock, timout)) > 0 &&
1982 			    wd->status == (uint_t)SEND_WAIT)
1983 				;
1984 			switch (cv_wait_ret) {
1985 			case -1:	/* timeout */
1986 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
1987 
1988 				wd->cv_sig = 0;		/* no signal needed */
1989 				error = RDMA_TIMEDOUT;
1990 				break;
1991 			default:	/* got send completion */
1992 				break;
1993 			}
1994 		} else {
1995 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
1996 			    &wd->sendwait_lock, timout)) > 0 &&
1997 			    wd->status == (uint_t)SEND_WAIT)
1998 				;
1999 			switch (cv_wait_ret) {
2000 			case -1:	/* timeout */
2001 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2002 
2003 				wd->cv_sig = 0;		/* no signal needed */
2004 				error = RDMA_TIMEDOUT;
2005 				break;
2006 			case 0:		/* interrupted */
2007 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2008 
2009 				wd->cv_sig = 0;		/* no signal needed */
2010 				error = RDMA_INTR;
2011 				break;
2012 			default:	/* got send completion */
2013 				break;
2014 			}
2015 		}
2016 	}
2017 
2018 	if (wd->status != (uint_t)SEND_WAIT) {
2019 		/* got send completion */
2020 		if (wd->status != RDMA_SUCCESS) {
2021 			if (wd->status != RDMA_CONNLOST) {
2022 				error = RDMA_FAILED;
2023 			} else {
2024 				error = RDMA_CONNLOST;
2025 			}
2026 		}
2027 		for (i = 0; i < wd->nsbufs; i++) {
2028 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2029 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2030 		}
2031 		mutex_exit(&wd->sendwait_lock);
2032 		(void) rib_free_sendwait(wd);
2033 	} else {
2034 		mutex_exit(&wd->sendwait_lock);
2035 	}
2036 	return (error);
2037 }
2038 
2039 static struct send_wid *
2040 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2041 {
2042 	struct send_wid	*wd;
2043 
2044 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2045 	wd->xid = xid;
2046 	wd->cv_sig = cv_sig;
2047 	wd->qp = qp;
2048 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2049 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2050 	wd->status = (uint_t)SEND_WAIT;
2051 
2052 	return (wd);
2053 }
2054 
2055 static int
2056 rib_free_sendwait(struct send_wid *wdesc)
2057 {
2058 	cv_destroy(&wdesc->wait_cv);
2059 	mutex_destroy(&wdesc->sendwait_lock);
2060 	kmem_free(wdesc, sizeof (*wdesc));
2061 
2062 	return (0);
2063 }
2064 
2065 static rdma_stat
2066 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2067 {
2068 	mutex_enter(&qp->replylist_lock);
2069 	if (rep != NULL) {
2070 		(void) rib_remreply(qp, rep);
2071 		mutex_exit(&qp->replylist_lock);
2072 		return (RDMA_SUCCESS);
2073 	}
2074 	mutex_exit(&qp->replylist_lock);
2075 	return (RDMA_FAILED);
2076 }
2077 
2078 /*
2079  * Send buffers are freed here only in case of error in posting
2080  * on QP. If the post succeeded, the send buffers are freed upon
2081  * send completion in rib_sendwait() or in the scq_handler.
2082  */
2083 rdma_stat
2084 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2085 	int send_sig, int cv_sig, caddr_t *swid)
2086 {
2087 	struct send_wid	*wdesc;
2088 	struct clist	*clp;
2089 	ibt_status_t	ibt_status = IBT_SUCCESS;
2090 	rdma_stat	ret = RDMA_SUCCESS;
2091 	ibt_send_wr_t	tx_wr;
2092 	int		i, nds;
2093 	ibt_wr_ds_t	sgl[DSEG_MAX];
2094 	uint_t		total_msg_size;
2095 	rib_qp_t	*qp;
2096 
2097 	qp = ctoqp(conn);
2098 
2099 	ASSERT(cl != NULL);
2100 
2101 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2102 
2103 	nds = 0;
2104 	total_msg_size = 0;
2105 	clp = cl;
2106 	while (clp != NULL) {
2107 		if (nds >= DSEG_MAX) {
2108 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2109 			return (RDMA_FAILED);
2110 		}
2111 		sgl[nds].ds_va = clp->w.c_saddr;
2112 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2113 		sgl[nds].ds_len = clp->c_len;
2114 		total_msg_size += clp->c_len;
2115 		clp = clp->c_next;
2116 		nds++;
2117 	}
2118 
2119 	if (send_sig) {
2120 		/* Set SEND_SIGNAL flag. */
2121 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2122 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2123 		*swid = (caddr_t)wdesc;
2124 	} else {
2125 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2126 		wdesc = rib_init_sendwait(msgid, 0, qp);
2127 		*swid = (caddr_t)wdesc;
2128 	}
2129 	wdesc->nsbufs = nds;
2130 	for (i = 0; i < nds; i++) {
2131 		wdesc->sbufaddr[i] = sgl[i].ds_va;
2132 	}
2133 
2134 	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2135 	tx_wr.wr_opcode = IBT_WRC_SEND;
2136 	tx_wr.wr_trans = IBT_RC_SRV;
2137 	tx_wr.wr_nds = nds;
2138 	tx_wr.wr_sgl = sgl;
2139 
2140 	mutex_enter(&conn->c_lock);
2141 	if (conn->c_state == C_CONNECTED) {
2142 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2143 	}
2144 	if (conn->c_state != C_CONNECTED ||
2145 	    ibt_status != IBT_SUCCESS) {
2146 		if (conn->c_state != C_DISCONN_PEND)
2147 			conn->c_state = C_ERROR_CONN;
2148 		mutex_exit(&conn->c_lock);
2149 		for (i = 0; i < nds; i++) {
2150 			rib_rbuf_free(conn, SEND_BUFFER,
2151 			    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2152 		}
2153 		(void) rib_free_sendwait(wdesc);
2154 		return (RDMA_CONNLOST);
2155 	}
2156 	mutex_exit(&conn->c_lock);
2157 
2158 	if (send_sig) {
2159 		if (cv_sig) {
2160 			/*
2161 			 * cv_wait for send to complete.
2162 			 * We can fail due to a timeout or signal or
2163 			 * unsuccessful send.
2164 			 */
2165 			ret = rib_sendwait(qp, wdesc);
2166 
2167 			return (ret);
2168 		}
2169 	}
2170 
2171 	return (RDMA_SUCCESS);
2172 }
2173 
2174 
2175 rdma_stat
2176 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2177 {
2178 	rdma_stat	ret;
2179 	caddr_t		wd;
2180 
2181 	/* send-wait & cv_signal */
2182 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2183 	return (ret);
2184 }
2185 
2186 /*
2187  * Server interface (svc_rdma_ksend).
2188  * Send RPC reply and wait for RDMA_DONE.
2189  */
2190 rdma_stat
2191 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2192 {
2193 	rdma_stat ret = RDMA_SUCCESS;
2194 	struct rdma_done_list *rd;
2195 	clock_t timout, cv_wait_ret;
2196 	caddr_t *wid = NULL;
2197 	rib_qp_t *qp = ctoqp(conn);
2198 
2199 	mutex_enter(&qp->rdlist_lock);
2200 	rd = rdma_done_add(qp, msgid);
2201 
2202 	/* No cv_signal (whether send-wait or no-send-wait) */
2203 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2204 
2205 	if (ret != RDMA_SUCCESS) {
2206 		rdma_done_rm(qp, rd);
2207 	} else {
2208 		/*
2209 		 * Wait for RDMA_DONE from remote end
2210 		 */
2211 		timout =
2212 		    drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2213 		cv_wait_ret = cv_timedwait(&rd->rdma_done_cv,
2214 		    &qp->rdlist_lock,
2215 		    timout);
2216 
2217 		rdma_done_rm(qp, rd);
2218 
2219 		if (cv_wait_ret < 0) {
2220 			ret = RDMA_TIMEDOUT;
2221 		}
2222 	}
2223 
2224 	mutex_exit(&qp->rdlist_lock);
2225 	return (ret);
2226 }
2227 
2228 static struct recv_wid *
2229 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2230 {
2231 	struct recv_wid	*rwid;
2232 
2233 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2234 	rwid->xid = msgid;
2235 	rwid->addr = sgl->ds_va;
2236 	rwid->qp = qp;
2237 
2238 	return (rwid);
2239 }
2240 
2241 static void
2242 rib_free_wid(struct recv_wid *rwid)
2243 {
2244 	kmem_free(rwid, sizeof (struct recv_wid));
2245 }
2246 
2247 rdma_stat
2248 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2249 {
2250 	rib_qp_t	*qp = ctoqp(conn);
2251 	struct clist	*clp = cl;
2252 	struct reply	*rep;
2253 	struct recv_wid	*rwid;
2254 	int		nds;
2255 	ibt_wr_ds_t	sgl[DSEG_MAX];
2256 	ibt_recv_wr_t	recv_wr;
2257 	rdma_stat	ret;
2258 	ibt_status_t	ibt_status;
2259 
2260 	/*
2261 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2262 	 */
2263 
2264 	nds = 0;
2265 	while (cl != NULL) {
2266 		if (nds >= DSEG_MAX) {
2267 			ret = RDMA_FAILED;
2268 			goto done;
2269 		}
2270 		sgl[nds].ds_va = cl->w.c_saddr;
2271 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2272 		sgl[nds].ds_len = cl->c_len;
2273 		cl = cl->c_next;
2274 		nds++;
2275 	}
2276 
2277 	if (nds != 1) {
2278 		ret = RDMA_FAILED;
2279 		goto done;
2280 	}
2281 
2282 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2283 	recv_wr.wr_nds = nds;
2284 	recv_wr.wr_sgl = sgl;
2285 
2286 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2287 	if (rwid) {
2288 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2289 	} else {
2290 		ret = RDMA_NORESOURCE;
2291 		goto done;
2292 	}
2293 	rep = rib_addreplylist(qp, msgid);
2294 	if (!rep) {
2295 		rib_free_wid(rwid);
2296 		ret = RDMA_NORESOURCE;
2297 		goto done;
2298 	}
2299 
2300 	mutex_enter(&conn->c_lock);
2301 
2302 	if (conn->c_state == C_CONNECTED) {
2303 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2304 	}
2305 
2306 	if (conn->c_state != C_CONNECTED ||
2307 	    ibt_status != IBT_SUCCESS) {
2308 		if (conn->c_state != C_DISCONN_PEND)
2309 			conn->c_state = C_ERROR_CONN;
2310 		mutex_exit(&conn->c_lock);
2311 		rib_free_wid(rwid);
2312 		(void) rib_rem_rep(qp, rep);
2313 		ret = RDMA_CONNLOST;
2314 		goto done;
2315 	}
2316 	mutex_exit(&conn->c_lock);
2317 	return (RDMA_SUCCESS);
2318 
2319 done:
2320 	while (clp != NULL) {
2321 		rib_rbuf_free(conn, RECV_BUFFER,
2322 		    (void *)(uintptr_t)clp->w.c_saddr3);
2323 		clp = clp->c_next;
2324 	}
2325 	return (ret);
2326 }
2327 
2328 rdma_stat
2329 rib_svc_post(CONN* conn, struct clist *cl)
2330 {
2331 	rib_qp_t	*qp = ctoqp(conn);
2332 	struct svc_recv	*s_recvp;
2333 	int		nds;
2334 	ibt_wr_ds_t	sgl[DSEG_MAX];
2335 	ibt_recv_wr_t	recv_wr;
2336 	ibt_status_t	ibt_status;
2337 
2338 	nds = 0;
2339 	while (cl != NULL) {
2340 		if (nds >= DSEG_MAX) {
2341 			return (RDMA_FAILED);
2342 		}
2343 		sgl[nds].ds_va = cl->w.c_saddr;
2344 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2345 		sgl[nds].ds_len = cl->c_len;
2346 		cl = cl->c_next;
2347 		nds++;
2348 	}
2349 
2350 	if (nds != 1) {
2351 		rib_rbuf_free(conn, RECV_BUFFER,
2352 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2353 
2354 		return (RDMA_FAILED);
2355 	}
2356 
2357 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2358 	recv_wr.wr_nds = nds;
2359 	recv_wr.wr_sgl = sgl;
2360 
2361 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2362 	/* Use s_recvp's addr as wr id */
2363 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2364 	mutex_enter(&conn->c_lock);
2365 	if (conn->c_state == C_CONNECTED) {
2366 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2367 	}
2368 	if (conn->c_state != C_CONNECTED ||
2369 	    ibt_status != IBT_SUCCESS) {
2370 		if (conn->c_state != C_DISCONN_PEND)
2371 			conn->c_state = C_ERROR_CONN;
2372 		mutex_exit(&conn->c_lock);
2373 		rib_rbuf_free(conn, RECV_BUFFER,
2374 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2375 		(void) rib_free_svc_recv(s_recvp);
2376 
2377 		return (RDMA_CONNLOST);
2378 	}
2379 	mutex_exit(&conn->c_lock);
2380 
2381 	return (RDMA_SUCCESS);
2382 }
2383 
2384 /* Client */
2385 rdma_stat
2386 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2387 {
2388 
2389 	return (rib_clnt_post(conn, cl, msgid));
2390 }
2391 
2392 /* Client */
2393 rdma_stat
2394 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2395 {
2396 	rib_qp_t	*qp = ctoqp(conn);
2397 	struct reply	*rep;
2398 
2399 	mutex_enter(&qp->replylist_lock);
2400 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2401 		if (rep->xid == msgid) {
2402 			if (rep->vaddr_cq) {
2403 				rib_rbuf_free(conn, RECV_BUFFER,
2404 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2405 			}
2406 			(void) rib_remreply(qp, rep);
2407 			break;
2408 		}
2409 	}
2410 	mutex_exit(&qp->replylist_lock);
2411 
2412 	return (RDMA_SUCCESS);
2413 }
2414 
2415 /* Server */
2416 rdma_stat
2417 rib_post_recv(CONN *conn, struct clist *cl)
2418 {
2419 	rib_qp_t	*qp = ctoqp(conn);
2420 
2421 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2422 		mutex_enter(&qp->posted_rbufs_lock);
2423 		qp->n_posted_rbufs++;
2424 		mutex_exit(&qp->posted_rbufs_lock);
2425 		return (RDMA_SUCCESS);
2426 	}
2427 	return (RDMA_FAILED);
2428 }
2429 
2430 /*
2431  * Client side only interface to "recv" the rpc reply buf
2432  * posted earlier by rib_post_resp(conn, cl, msgid).
2433  */
2434 rdma_stat
2435 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2436 {
2437 	struct reply *rep = NULL;
2438 	clock_t timout, cv_wait_ret;
2439 	rdma_stat ret = RDMA_SUCCESS;
2440 	rib_qp_t *qp = ctoqp(conn);
2441 
2442 	/*
2443 	 * Find the reply structure for this msgid
2444 	 */
2445 	mutex_enter(&qp->replylist_lock);
2446 
2447 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2448 		if (rep->xid == msgid)
2449 			break;
2450 	}
2451 
2452 	if (rep != NULL) {
2453 		/*
2454 		 * If message not yet received, wait.
2455 		 */
2456 		if (rep->status == (uint_t)REPLY_WAIT) {
2457 			timout = ddi_get_lbolt() +
2458 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2459 
2460 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2461 			    &qp->replylist_lock, timout)) > 0 &&
2462 			    rep->status == (uint_t)REPLY_WAIT)
2463 				;
2464 
2465 			switch (cv_wait_ret) {
2466 			case -1:	/* timeout */
2467 				ret = RDMA_TIMEDOUT;
2468 				break;
2469 			case 0:
2470 				ret = RDMA_INTR;
2471 				break;
2472 			default:
2473 				break;
2474 			}
2475 		}
2476 
2477 		if (rep->status == RDMA_SUCCESS) {
2478 			struct clist *cl = NULL;
2479 
2480 			/*
2481 			 * Got message successfully
2482 			 */
2483 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2484 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2485 			*clp = cl;
2486 		} else {
2487 			if (rep->status != (uint_t)REPLY_WAIT) {
2488 				/*
2489 				 * Got error in reply message. Free
2490 				 * recv buffer here.
2491 				 */
2492 				ret = rep->status;
2493 				rib_rbuf_free(conn, RECV_BUFFER,
2494 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2495 			}
2496 		}
2497 		(void) rib_remreply(qp, rep);
2498 	} else {
2499 		/*
2500 		 * No matching reply structure found for given msgid on the
2501 		 * reply wait list.
2502 		 */
2503 		ret = RDMA_INVAL;
2504 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2505 	}
2506 
2507 	/*
2508 	 * Done.
2509 	 */
2510 	mutex_exit(&qp->replylist_lock);
2511 	return (ret);
2512 }
2513 
2514 /*
2515  * RDMA write a buffer to the remote address.
2516  */
2517 rdma_stat
2518 rib_write(CONN *conn, struct clist *cl, int wait)
2519 {
2520 	ibt_send_wr_t	tx_wr;
2521 	int		cv_sig;
2522 	int		i;
2523 	ibt_wr_ds_t	sgl[DSEG_MAX];
2524 	struct send_wid	*wdesc;
2525 	ibt_status_t	ibt_status;
2526 	rdma_stat	ret = RDMA_SUCCESS;
2527 	rib_qp_t	*qp = ctoqp(conn);
2528 	uint64_t	n_writes = 0;
2529 	bool_t		force_wait = FALSE;
2530 
2531 	if (cl == NULL) {
2532 		return (RDMA_FAILED);
2533 	}
2534 
2535 	while ((cl != NULL)) {
2536 		if (cl->c_len > 0) {
2537 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2538 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2539 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2540 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2541 			sgl[0].ds_va = cl->w.c_saddr;
2542 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2543 			sgl[0].ds_len = cl->c_len;
2544 
2545 			if (wait) {
2546 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2547 				cv_sig = 1;
2548 			} else {
2549 				if (n_writes > max_unsignaled_rws) {
2550 					n_writes = 0;
2551 					force_wait = TRUE;
2552 					tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2553 					cv_sig = 1;
2554 				} else {
2555 					tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2556 					cv_sig = 0;
2557 				}
2558 			}
2559 
2560 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2561 			tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2562 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2563 			tx_wr.wr_trans = IBT_RC_SRV;
2564 			tx_wr.wr_nds = 1;
2565 			tx_wr.wr_sgl = sgl;
2566 
2567 			mutex_enter(&conn->c_lock);
2568 			if (conn->c_state == C_CONNECTED) {
2569 				ibt_status =
2570 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2571 			}
2572 			if (conn->c_state != C_CONNECTED ||
2573 			    ibt_status != IBT_SUCCESS) {
2574 				if (conn->c_state != C_DISCONN_PEND)
2575 					conn->c_state = C_ERROR_CONN;
2576 				mutex_exit(&conn->c_lock);
2577 				(void) rib_free_sendwait(wdesc);
2578 				return (RDMA_CONNLOST);
2579 			}
2580 			mutex_exit(&conn->c_lock);
2581 
2582 			/*
2583 			 * Wait for send to complete
2584 			 */
2585 			if (wait || force_wait) {
2586 				force_wait = FALSE;
2587 				ret = rib_sendwait(qp, wdesc);
2588 				if (ret != 0) {
2589 					return (ret);
2590 				}
2591 			} else {
2592 				mutex_enter(&wdesc->sendwait_lock);
2593 				for (i = 0; i < wdesc->nsbufs; i++) {
2594 					rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2595 					    (void *)(uintptr_t)
2596 					    wdesc->sbufaddr[i]);
2597 				}
2598 				mutex_exit(&wdesc->sendwait_lock);
2599 				(void) rib_free_sendwait(wdesc);
2600 			}
2601 			n_writes ++;
2602 		}
2603 		cl = cl->c_next;
2604 	}
2605 	return (RDMA_SUCCESS);
2606 }
2607 
2608 /*
2609  * RDMA Read a buffer from the remote address.
2610  */
2611 rdma_stat
2612 rib_read(CONN *conn, struct clist *cl, int wait)
2613 {
2614 	ibt_send_wr_t	rx_wr;
2615 	int		cv_sig;
2616 	int		i;
2617 	ibt_wr_ds_t	sgl;
2618 	struct send_wid	*wdesc;
2619 	ibt_status_t	ibt_status = IBT_SUCCESS;
2620 	rdma_stat	ret = RDMA_SUCCESS;
2621 	rib_qp_t	*qp = ctoqp(conn);
2622 
2623 	if (cl == NULL) {
2624 		return (RDMA_FAILED);
2625 	}
2626 
2627 	while (cl != NULL) {
2628 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2629 		/*
2630 		 * Remote address is at the head chunk item in list.
2631 		 */
2632 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2633 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2634 
2635 		sgl.ds_va = cl->u.c_daddr;
2636 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2637 		sgl.ds_len = cl->c_len;
2638 
2639 		if (wait) {
2640 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2641 			cv_sig = 1;
2642 		} else {
2643 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2644 			cv_sig = 0;
2645 		}
2646 
2647 		wdesc = rib_init_sendwait(0, cv_sig, qp);
2648 		rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2649 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2650 		rx_wr.wr_trans = IBT_RC_SRV;
2651 		rx_wr.wr_nds = 1;
2652 		rx_wr.wr_sgl = &sgl;
2653 
2654 		mutex_enter(&conn->c_lock);
2655 		if (conn->c_state == C_CONNECTED) {
2656 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2657 		}
2658 		if (conn->c_state != C_CONNECTED ||
2659 		    ibt_status != IBT_SUCCESS) {
2660 			if (conn->c_state != C_DISCONN_PEND)
2661 				conn->c_state = C_ERROR_CONN;
2662 			mutex_exit(&conn->c_lock);
2663 			(void) rib_free_sendwait(wdesc);
2664 			return (RDMA_CONNLOST);
2665 		}
2666 		mutex_exit(&conn->c_lock);
2667 
2668 		/*
2669 		 * Wait for send to complete if this is the
2670 		 * last item in the list.
2671 		 */
2672 		if (wait && cl->c_next == NULL) {
2673 			ret = rib_sendwait(qp, wdesc);
2674 			if (ret != 0) {
2675 				return (ret);
2676 			}
2677 		} else {
2678 			mutex_enter(&wdesc->sendwait_lock);
2679 			for (i = 0; i < wdesc->nsbufs; i++) {
2680 				rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2681 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2682 			}
2683 			mutex_exit(&wdesc->sendwait_lock);
2684 			(void) rib_free_sendwait(wdesc);
2685 		}
2686 		cl = cl->c_next;
2687 	}
2688 	return (RDMA_SUCCESS);
2689 }
2690 
2691 /*
2692  * rib_srv_cm_handler()
2693  *    Connection Manager callback to handle RC connection requests.
2694  */
2695 /* ARGSUSED */
2696 static ibt_cm_status_t
2697 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2698 	ibt_cm_return_args_t *ret_args, void *priv_data,
2699 	ibt_priv_data_len_t len)
2700 {
2701 	queue_t		*q;
2702 	rib_qp_t	*qp;
2703 	rpcib_state_t	*ribstat;
2704 	rib_hca_t	*hca;
2705 	rdma_stat	status = RDMA_SUCCESS;
2706 	int		i;
2707 	struct clist	cl;
2708 	rdma_buf_t	rdbuf = {0};
2709 	void		*buf = NULL;
2710 	CONN		*conn;
2711 	ibt_ip_cm_info_t	ipinfo;
2712 	struct sockaddr_in *s;
2713 	struct sockaddr_in6 *s6;
2714 	int sin_size = sizeof (struct sockaddr_in);
2715 	int in_size = sizeof (struct in_addr);
2716 	int sin6_size = sizeof (struct sockaddr_in6);
2717 
2718 	ASSERT(any != NULL);
2719 	ASSERT(event != NULL);
2720 
2721 	ribstat = (rpcib_state_t *)any;
2722 	hca = (rib_hca_t *)ribstat->hca;
2723 	ASSERT(hca != NULL);
2724 
2725 	/* got a connection request */
2726 	switch (event->cm_type) {
2727 	case IBT_CM_EVENT_REQ_RCV:
2728 		/*
2729 		 * If the plugin is in the NO_ACCEPT state, bail out.
2730 		 */
2731 		mutex_enter(&plugin_state_lock);
2732 		if (plugin_state == NO_ACCEPT) {
2733 			mutex_exit(&plugin_state_lock);
2734 			return (IBT_CM_REJECT);
2735 		}
2736 		mutex_exit(&plugin_state_lock);
2737 
2738 		/*
2739 		 * Need to send a MRA MAD to CM so that it does not
2740 		 * timeout on us.
2741 		 */
2742 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2743 		    event->cm_event.req.req_timeout * 8, NULL, 0);
2744 
2745 		mutex_enter(&rib_stat->open_hca_lock);
2746 		q = rib_stat->q;
2747 		mutex_exit(&rib_stat->open_hca_lock);
2748 
2749 		status = rib_svc_create_chan(hca, (caddr_t)q,
2750 		    event->cm_event.req.req_prim_hca_port, &qp);
2751 
2752 		if (status) {
2753 			return (IBT_CM_REJECT);
2754 		}
2755 
2756 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2757 		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2758 		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2759 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2760 
2761 		/*
2762 		 * Pre-posts RECV buffers
2763 		 */
2764 		conn = qptoc(qp);
2765 		for (i = 0; i < preposted_rbufs; i++) {
2766 			bzero(&rdbuf, sizeof (rdbuf));
2767 			rdbuf.type = RECV_BUFFER;
2768 			buf = rib_rbuf_alloc(conn, &rdbuf);
2769 			if (buf == NULL) {
2770 				(void) rib_disconnect_channel(conn, NULL);
2771 				return (IBT_CM_REJECT);
2772 			}
2773 
2774 			bzero(&cl, sizeof (cl));
2775 			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
2776 			cl.c_len = rdbuf.len;
2777 			cl.c_smemhandle.mrc_lmr =
2778 			    rdbuf.handle.mrc_lmr; /* lkey */
2779 			cl.c_next = NULL;
2780 			status = rib_post_recv(conn, &cl);
2781 			if (status != RDMA_SUCCESS) {
2782 				(void) rib_disconnect_channel(conn, NULL);
2783 				return (IBT_CM_REJECT);
2784 			}
2785 		}
2786 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
2787 
2788 		/*
2789 		 * Get the address translation
2790 		 */
2791 		rw_enter(&hca->state_lock, RW_READER);
2792 		if (hca->state == HCA_DETACHED) {
2793 			rw_exit(&hca->state_lock);
2794 			return (IBT_CM_REJECT);
2795 		}
2796 		rw_exit(&hca->state_lock);
2797 
2798 		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
2799 
2800 		if (ibt_get_ip_data(event->cm_priv_data_len,
2801 		    event->cm_priv_data,
2802 		    &ipinfo) != IBT_SUCCESS) {
2803 
2804 			return (IBT_CM_REJECT);
2805 		}
2806 
2807 		switch (ipinfo.src_addr.family) {
2808 		case AF_INET:
2809 
2810 			conn->c_raddr.maxlen =
2811 			    conn->c_raddr.len = sin_size;
2812 			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
2813 
2814 			s = (struct sockaddr_in *)conn->c_raddr.buf;
2815 			s->sin_family = AF_INET;
2816 
2817 			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
2818 			    &s->sin_addr, in_size);
2819 
2820 			break;
2821 
2822 		case AF_INET6:
2823 
2824 			conn->c_raddr.maxlen =
2825 			    conn->c_raddr.len = sin6_size;
2826 			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
2827 
2828 			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
2829 			s6->sin6_family = AF_INET6;
2830 			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
2831 			    &s6->sin6_addr,
2832 			    sizeof (struct in6_addr));
2833 
2834 			break;
2835 
2836 		default:
2837 			return (IBT_CM_REJECT);
2838 		}
2839 
2840 		break;
2841 
2842 	case IBT_CM_EVENT_CONN_CLOSED:
2843 	{
2844 		CONN		*conn;
2845 		rib_qp_t	*qp;
2846 
2847 		switch (event->cm_event.closed) {
2848 		case IBT_CM_CLOSED_DREP_RCVD:
2849 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
2850 		case IBT_CM_CLOSED_DUP:
2851 		case IBT_CM_CLOSED_ABORT:
2852 		case IBT_CM_CLOSED_ALREADY:
2853 			/*
2854 			 * These cases indicate the local end initiated
2855 			 * the closing of the channel. Nothing to do here.
2856 			 */
2857 			break;
2858 		default:
2859 			/*
2860 			 * Reason for CONN_CLOSED event must be one of
2861 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
2862 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
2863 			 * the remote end is closing the channel. In these
2864 			 * cases free the channel and transition to error
2865 			 * state
2866 			 */
2867 			qp = ibt_get_chan_private(event->cm_channel);
2868 			conn = qptoc(qp);
2869 			mutex_enter(&conn->c_lock);
2870 			if (conn->c_state == C_DISCONN_PEND) {
2871 				mutex_exit(&conn->c_lock);
2872 				break;
2873 			}
2874 			conn->c_state = C_ERROR_CONN;
2875 
2876 			/*
2877 			 * Free the rc_channel. Channel has already
2878 			 * transitioned to ERROR state and WRs have been
2879 			 * FLUSHED_ERR already.
2880 			 */
2881 			(void) ibt_free_channel(qp->qp_hdl);
2882 			qp->qp_hdl = NULL;
2883 
2884 			/*
2885 			 * Free the conn if c_ref goes down to 0
2886 			 */
2887 			if (conn->c_ref == 0) {
2888 				/*
2889 				 * Remove from list and free conn
2890 				 */
2891 				conn->c_state = C_DISCONN_PEND;
2892 				mutex_exit(&conn->c_lock);
2893 				(void) rib_disconnect_channel(conn,
2894 				    &hca->srv_conn_list);
2895 			} else {
2896 				mutex_exit(&conn->c_lock);
2897 			}
2898 			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
2899 			break;
2900 		}
2901 		break;
2902 	}
2903 	case IBT_CM_EVENT_CONN_EST:
2904 		/*
2905 		 * RTU received, hence connection established.
2906 		 */
2907 		if (rib_debug > 1)
2908 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2909 			    "(CONN_EST) channel established");
2910 		break;
2911 
2912 	default:
2913 		if (rib_debug > 2) {
2914 			/* Let CM handle the following events. */
2915 			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
2916 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2917 				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
2918 			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
2919 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2920 				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
2921 			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
2922 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2923 				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
2924 			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
2925 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2926 				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
2927 			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
2928 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2929 				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
2930 			}
2931 		}
2932 		return (IBT_CM_DEFAULT);
2933 	}
2934 
2935 	/* accept all other CM messages (i.e. let the CM handle them) */
2936 	return (IBT_CM_ACCEPT);
2937 }
2938 
2939 static rdma_stat
2940 rib_register_service(rib_hca_t *hca, int service_type)
2941 {
2942 	ibt_srv_desc_t		sdesc;
2943 	ibt_hca_portinfo_t	*port_infop;
2944 	ib_svc_id_t		srv_id;
2945 	ibt_srv_hdl_t		srv_hdl;
2946 	uint_t			port_size;
2947 	uint_t			pki, i, num_ports, nbinds;
2948 	ibt_status_t		ibt_status;
2949 	rib_service_t		*new_service;
2950 	ib_pkey_t		pkey;
2951 
2952 	/*
2953 	 * Query all ports for the given HCA
2954 	 */
2955 	rw_enter(&hca->state_lock, RW_READER);
2956 	if (hca->state != HCA_DETACHED) {
2957 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
2958 		    &num_ports, &port_size);
2959 		rw_exit(&hca->state_lock);
2960 	} else {
2961 		rw_exit(&hca->state_lock);
2962 		return (RDMA_FAILED);
2963 	}
2964 	if (ibt_status != IBT_SUCCESS) {
2965 		return (RDMA_FAILED);
2966 	}
2967 
2968 	DTRACE_PROBE1(rpcib__i__regservice_numports,
2969 	    int, num_ports);
2970 
2971 	for (i = 0; i < num_ports; i++) {
2972 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
2973 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
2974 			    int, i+1);
2975 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
2976 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
2977 			    int, i+1);
2978 		}
2979 	}
2980 
2981 	/*
2982 	 * Get all the IP addresses on this system to register the
2983 	 * given "service type" on all DNS recognized IP addrs.
2984 	 * Each service type such as NFS will have all the systems
2985 	 * IP addresses as its different names. For now the only
2986 	 * type of service we support in RPCIB is NFS.
2987 	 */
2988 	rw_enter(&hca->service_list_lock, RW_WRITER);
2989 	/*
2990 	 * Start registering and binding service to active
2991 	 * on active ports on this HCA.
2992 	 */
2993 	nbinds = 0;
2994 	new_service = NULL;
2995 
2996 	/*
2997 	 * We use IP addresses as the service names for
2998 	 * service registration.  Register each of them
2999 	 * with CM to obtain a svc_id and svc_hdl.  We do not
3000 	 * register the service with machine's loopback address.
3001 	 */
3002 	(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3003 	(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3004 	(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3005 
3006 	sdesc.sd_handler = rib_srv_cm_handler;
3007 	sdesc.sd_flags = 0;
3008 	ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3009 	    &sdesc, ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port),
3010 	    1, &srv_hdl, &srv_id);
3011 
3012 	for (i = 0; i < num_ports; i++) {
3013 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3014 			continue;
3015 
3016 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3017 			pkey = port_infop[i].p_pkey_tbl[pki];
3018 			if ((pkey & IBSRM_HB) &&
3019 			    (pkey != IB_PKEY_INVALID_FULL)) {
3020 
3021 				/*
3022 				 * Allocate and prepare a service entry
3023 				 */
3024 				new_service =
3025 				    kmem_zalloc(1 * sizeof (rib_service_t),
3026 				    KM_SLEEP);
3027 
3028 				new_service->srv_type = service_type;
3029 				new_service->srv_hdl = srv_hdl;
3030 				new_service->srv_next = NULL;
3031 
3032 				ibt_status = ibt_bind_service(srv_hdl,
3033 				    port_infop[i].p_sgid_tbl[0],
3034 				    NULL, rib_stat, NULL);
3035 
3036 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3037 				    int, ibt_status);
3038 
3039 				if (ibt_status != IBT_SUCCESS) {
3040 					kmem_free(new_service,
3041 					    sizeof (rib_service_t));
3042 					new_service = NULL;
3043 					continue;
3044 				}
3045 
3046 				/*
3047 				 * Add to the service list for this HCA
3048 				 */
3049 				new_service->srv_next = hca->service_list;
3050 				hca->service_list = new_service;
3051 				new_service = NULL;
3052 				nbinds++;
3053 			}
3054 		}
3055 	}
3056 	rw_exit(&hca->service_list_lock);
3057 
3058 	ibt_free_portinfo(port_infop, port_size);
3059 
3060 	if (nbinds == 0) {
3061 		return (RDMA_FAILED);
3062 	} else {
3063 		/*
3064 		 * Put this plugin into accept state, since atleast
3065 		 * one registration was successful.
3066 		 */
3067 		mutex_enter(&plugin_state_lock);
3068 		plugin_state = ACCEPT;
3069 		mutex_exit(&plugin_state_lock);
3070 		return (RDMA_SUCCESS);
3071 	}
3072 }
3073 
3074 void
3075 rib_listen(struct rdma_svc_data *rd)
3076 {
3077 	rdma_stat status = RDMA_SUCCESS;
3078 
3079 	rd->active = 0;
3080 	rd->err_code = RDMA_FAILED;
3081 
3082 	/*
3083 	 * First check if a hca is still attached
3084 	 */
3085 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3086 	if (rib_stat->hca->state != HCA_INITED) {
3087 		rw_exit(&rib_stat->hca->state_lock);
3088 		return;
3089 	}
3090 	rw_exit(&rib_stat->hca->state_lock);
3091 
3092 	rib_stat->q = &rd->q;
3093 	/*
3094 	 * Right now the only service type is NFS. Hence force feed this
3095 	 * value. Ideally to communicate the service type it should be
3096 	 * passed down in rdma_svc_data.
3097 	 */
3098 	rib_stat->service_type = NFS;
3099 	status = rib_register_service(rib_stat->hca, NFS);
3100 	if (status != RDMA_SUCCESS) {
3101 		rd->err_code = status;
3102 		return;
3103 	}
3104 	/*
3105 	 * Service active on an HCA, check rd->err_code for more
3106 	 * explainable errors.
3107 	 */
3108 	rd->active = 1;
3109 	rd->err_code = status;
3110 }
3111 
3112 /* XXXX */
3113 /* ARGSUSED */
3114 static void
3115 rib_listen_stop(struct rdma_svc_data *svcdata)
3116 {
3117 	rib_hca_t		*hca;
3118 
3119 	/*
3120 	 * KRPC called the RDMATF to stop the listeners, this means
3121 	 * stop sending incomming or recieved requests to KRPC master
3122 	 * transport handle for RDMA-IB. This is also means that the
3123 	 * master transport handle, responsible for us, is going away.
3124 	 */
3125 	mutex_enter(&plugin_state_lock);
3126 	plugin_state = NO_ACCEPT;
3127 	if (svcdata != NULL)
3128 		svcdata->active = 0;
3129 	mutex_exit(&plugin_state_lock);
3130 
3131 	/*
3132 	 * First check if a hca is still attached
3133 	 */
3134 	hca = rib_stat->hca;
3135 	rw_enter(&hca->state_lock, RW_READER);
3136 	if (hca->state != HCA_INITED) {
3137 		rw_exit(&hca->state_lock);
3138 		return;
3139 	}
3140 	rib_close_channels(&hca->srv_conn_list);
3141 	rib_stop_services(hca);
3142 	rw_exit(&hca->state_lock);
3143 }
3144 
3145 /*
3146  * Traverse the HCA's service list to unbind and deregister services.
3147  * Instead of unbinding the service for a service handle by
3148  * calling ibt_unbind_service() for each port/pkey, we unbind
3149  * all the services for the service handle by making only one
3150  * call to ibt_unbind_all_services().  Then, we deregister the
3151  * service for the service handle.
3152  *
3153  * When traversing the entries in service_list, we compare the
3154  * srv_hdl of the current entry with that of the next.  If they
3155  * are different or if the next entry is NULL, the current entry
3156  * marks the last binding of the service handle.  In this case,
3157  * call ibt_unbind_all_services() and deregister the service for
3158  * the service handle.  If they are the same, the current and the
3159  * next entries are bound to the same service handle.  In this
3160  * case, move on to the next entry.
3161  */
3162 static void
3163 rib_stop_services(rib_hca_t *hca)
3164 {
3165 	rib_service_t		*srv_list, *to_remove;
3166 
3167 	/*
3168 	 * unbind and deregister the services for this service type.
3169 	 * Right now there is only one service type. In future it will
3170 	 * be passed down to this function.
3171 	 */
3172 	rw_enter(&hca->service_list_lock, RW_WRITER);
3173 	srv_list = hca->service_list;
3174 	while (srv_list != NULL) {
3175 		to_remove = srv_list;
3176 		srv_list = to_remove->srv_next;
3177 		if (srv_list == NULL || bcmp(to_remove->srv_hdl,
3178 		    srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
3179 
3180 			(void) ibt_unbind_all_services(to_remove->srv_hdl);
3181 			(void) ibt_deregister_service(hca->ibt_clnt_hdl,
3182 			    to_remove->srv_hdl);
3183 		}
3184 
3185 		kmem_free(to_remove, sizeof (rib_service_t));
3186 	}
3187 	hca->service_list = NULL;
3188 	rw_exit(&hca->service_list_lock);
3189 }
3190 
3191 static struct svc_recv *
3192 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3193 {
3194 	struct svc_recv	*recvp;
3195 
3196 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3197 	recvp->vaddr = sgl->ds_va;
3198 	recvp->qp = qp;
3199 	recvp->bytes_xfer = 0;
3200 	return (recvp);
3201 }
3202 
3203 static int
3204 rib_free_svc_recv(struct svc_recv *recvp)
3205 {
3206 	kmem_free(recvp, sizeof (*recvp));
3207 
3208 	return (0);
3209 }
3210 
3211 static struct reply *
3212 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3213 {
3214 	struct reply	*rep;
3215 
3216 
3217 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3218 	if (rep == NULL) {
3219 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3220 		return (NULL);
3221 	}
3222 	rep->xid = msgid;
3223 	rep->vaddr_cq = NULL;
3224 	rep->bytes_xfer = 0;
3225 	rep->status = (uint_t)REPLY_WAIT;
3226 	rep->prev = NULL;
3227 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3228 
3229 	mutex_enter(&qp->replylist_lock);
3230 	if (qp->replylist) {
3231 		rep->next = qp->replylist;
3232 		qp->replylist->prev = rep;
3233 	}
3234 	qp->rep_list_size++;
3235 
3236 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3237 	    int, qp->rep_list_size);
3238 
3239 	qp->replylist = rep;
3240 	mutex_exit(&qp->replylist_lock);
3241 
3242 	return (rep);
3243 }
3244 
3245 static rdma_stat
3246 rib_rem_replylist(rib_qp_t *qp)
3247 {
3248 	struct reply	*r, *n;
3249 
3250 	mutex_enter(&qp->replylist_lock);
3251 	for (r = qp->replylist; r != NULL; r = n) {
3252 		n = r->next;
3253 		(void) rib_remreply(qp, r);
3254 	}
3255 	mutex_exit(&qp->replylist_lock);
3256 
3257 	return (RDMA_SUCCESS);
3258 }
3259 
3260 static int
3261 rib_remreply(rib_qp_t *qp, struct reply *rep)
3262 {
3263 
3264 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3265 	if (rep->prev) {
3266 		rep->prev->next = rep->next;
3267 	}
3268 	if (rep->next) {
3269 		rep->next->prev = rep->prev;
3270 	}
3271 	if (qp->replylist == rep)
3272 		qp->replylist = rep->next;
3273 
3274 	cv_destroy(&rep->wait_cv);
3275 	qp->rep_list_size--;
3276 
3277 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3278 	    int, qp->rep_list_size);
3279 
3280 	kmem_free(rep, sizeof (*rep));
3281 
3282 	return (0);
3283 }
3284 
3285 rdma_stat
3286 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3287 	struct mrc *buf_handle)
3288 {
3289 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3290 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3291 	rdma_stat	status;
3292 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3293 
3294 	/*
3295 	 * Note: ALL buffer pools use the same memory type RDMARW.
3296 	 */
3297 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3298 	if (status == RDMA_SUCCESS) {
3299 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3300 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3301 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3302 	} else {
3303 		buf_handle->mrc_linfo = NULL;
3304 		buf_handle->mrc_lmr = 0;
3305 		buf_handle->mrc_rmr = 0;
3306 	}
3307 	return (status);
3308 }
3309 
3310 static rdma_stat
3311 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3312 	ibt_mr_flags_t spec,
3313 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3314 {
3315 	ibt_mr_attr_t	mem_attr;
3316 	ibt_status_t	ibt_status;
3317 	mem_attr.mr_vaddr = (uintptr_t)buf;
3318 	mem_attr.mr_len = (ib_msglen_t)size;
3319 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3320 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3321 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3322 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3323 
3324 	rw_enter(&hca->state_lock, RW_READER);
3325 	if (hca->state == HCA_INITED) {
3326 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3327 		    &mem_attr, mr_hdlp, mr_descp);
3328 		rw_exit(&hca->state_lock);
3329 	} else {
3330 		rw_exit(&hca->state_lock);
3331 		return (RDMA_FAILED);
3332 	}
3333 
3334 	if (ibt_status != IBT_SUCCESS) {
3335 		return (RDMA_FAILED);
3336 	}
3337 	return (RDMA_SUCCESS);
3338 }
3339 
3340 rdma_stat
3341 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3342 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3343 {
3344 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3345 	rib_lrc_entry_t *l;
3346 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3347 	rdma_stat	status;
3348 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3349 
3350 	/*
3351 	 * Non-coherent memory registration.
3352 	 */
3353 	l = (rib_lrc_entry_t *)lrc;
3354 	if (l) {
3355 		if (l->registered) {
3356 			buf_handle->mrc_linfo =
3357 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3358 			buf_handle->mrc_lmr =
3359 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3360 			buf_handle->mrc_rmr =
3361 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3362 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3363 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3364 			return (RDMA_SUCCESS);
3365 		} else {
3366 			/* Always register the whole buffer */
3367 			buf = (caddr_t)l->lrc_buf;
3368 			buflen = l->lrc_len;
3369 		}
3370 	}
3371 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3372 
3373 	if (status == RDMA_SUCCESS) {
3374 		if (l) {
3375 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3376 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3377 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3378 			l->registered		 = TRUE;
3379 		}
3380 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3381 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3382 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3383 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3384 	} else {
3385 		buf_handle->mrc_linfo = NULL;
3386 		buf_handle->mrc_lmr = 0;
3387 		buf_handle->mrc_rmr = 0;
3388 	}
3389 	return (status);
3390 }
3391 
3392 /* ARGSUSED */
3393 rdma_stat
3394 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3395 {
3396 	rib_hca_t *hca = (ctoqp(conn))->hca;
3397 	/*
3398 	 * Allow memory deregistration even if HCA is
3399 	 * getting detached. Need all outstanding
3400 	 * memory registrations to be deregistered
3401 	 * before HCA_DETACH_EVENT can be accepted.
3402 	 */
3403 	(void) ibt_deregister_mr(hca->hca_hdl,
3404 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3405 	return (RDMA_SUCCESS);
3406 }
3407 
3408 /* ARGSUSED */
3409 rdma_stat
3410 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3411 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3412 {
3413 	rib_lrc_entry_t *l;
3414 	l = (rib_lrc_entry_t *)lrc;
3415 	if (l)
3416 		if (l->registered)
3417 			return (RDMA_SUCCESS);
3418 
3419 	(void) rib_deregistermem(conn, buf, buf_handle);
3420 
3421 	return (RDMA_SUCCESS);
3422 }
3423 
3424 /* ARGSUSED */
3425 rdma_stat
3426 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3427 		int len, int cpu)
3428 {
3429 	ibt_status_t	status;
3430 	rib_hca_t *hca = (ctoqp(conn))->hca;
3431 	ibt_mr_sync_t	mr_segment;
3432 
3433 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3434 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3435 	mr_segment.ms_len = (ib_memlen_t)len;
3436 	if (cpu) {
3437 		/* make incoming data visible to memory */
3438 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3439 	} else {
3440 		/* make memory changes visible to IO */
3441 		mr_segment.ms_flags = IBT_SYNC_READ;
3442 	}
3443 	rw_enter(&hca->state_lock, RW_READER);
3444 	if (hca->state == HCA_INITED) {
3445 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3446 		rw_exit(&hca->state_lock);
3447 	} else {
3448 		rw_exit(&hca->state_lock);
3449 		return (RDMA_FAILED);
3450 	}
3451 
3452 	if (status == IBT_SUCCESS)
3453 		return (RDMA_SUCCESS);
3454 	else {
3455 		return (RDMA_FAILED);
3456 	}
3457 }
3458 
3459 /*
3460  * XXXX	????
3461  */
3462 static rdma_stat
3463 rib_getinfo(rdma_info_t *info)
3464 {
3465 	/*
3466 	 * XXXX	Hack!
3467 	 */
3468 	info->addrlen = 16;
3469 	info->mts = 1000000;
3470 	info->mtu = 1000000;
3471 
3472 	return (RDMA_SUCCESS);
3473 }
3474 
3475 rib_bufpool_t *
3476 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3477 {
3478 	rib_bufpool_t	*rbp = NULL;
3479 	bufpool_t	*bp = NULL;
3480 	caddr_t		buf;
3481 	ibt_mr_attr_t	mem_attr;
3482 	ibt_status_t	ibt_status;
3483 	int		i, j;
3484 
3485 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3486 
3487 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3488 	    num * sizeof (void *), KM_SLEEP);
3489 
3490 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3491 	bp->numelems = num;
3492 
3493 
3494 	switch (ptype) {
3495 	case SEND_BUFFER:
3496 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3497 		bp->rsize = RPC_MSG_SZ;
3498 		break;
3499 	case RECV_BUFFER:
3500 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3501 		bp->rsize = RPC_BUF_SIZE;
3502 		break;
3503 	default:
3504 		goto fail;
3505 	}
3506 
3507 	/*
3508 	 * Register the pool.
3509 	 */
3510 	bp->bufsize = num * bp->rsize;
3511 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3512 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3513 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3514 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3515 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3516 	rw_enter(&hca->state_lock, RW_READER);
3517 
3518 	if (hca->state != HCA_INITED) {
3519 		rw_exit(&hca->state_lock);
3520 		goto fail;
3521 	}
3522 
3523 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3524 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3525 		mem_attr.mr_vaddr = (uintptr_t)buf;
3526 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3527 		mem_attr.mr_as = NULL;
3528 		ibt_status = ibt_register_mr(hca->hca_hdl,
3529 		    hca->pd_hdl, &mem_attr,
3530 		    &rbp->mr_hdl[i],
3531 		    &rbp->mr_desc[i]);
3532 		if (ibt_status != IBT_SUCCESS) {
3533 			for (j = 0; j < i; j++) {
3534 				(void) ibt_deregister_mr(hca->hca_hdl,
3535 				    rbp->mr_hdl[j]);
3536 			}
3537 			rw_exit(&hca->state_lock);
3538 			goto fail;
3539 		}
3540 	}
3541 	rw_exit(&hca->state_lock);
3542 	buf = (caddr_t)bp->buf;
3543 	for (i = 0; i < num; i++, buf += bp->rsize) {
3544 		bp->buflist[i] = (void *)buf;
3545 	}
3546 	bp->buffree = num - 1;	/* no. of free buffers */
3547 	rbp->bpool = bp;
3548 
3549 	return (rbp);
3550 fail:
3551 	if (bp) {
3552 		if (bp->buf)
3553 			kmem_free(bp->buf, bp->bufsize);
3554 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3555 	}
3556 	if (rbp) {
3557 		if (rbp->mr_hdl)
3558 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3559 		if (rbp->mr_desc)
3560 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3561 		kmem_free(rbp, sizeof (rib_bufpool_t));
3562 	}
3563 	return (NULL);
3564 }
3565 
3566 static void
3567 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3568 {
3569 	int i;
3570 	rib_bufpool_t *rbp = NULL;
3571 	bufpool_t *bp;
3572 
3573 	/*
3574 	 * Obtain pool address based on type of pool
3575 	 */
3576 	switch (ptype) {
3577 		case SEND_BUFFER:
3578 			rbp = hca->send_pool;
3579 			break;
3580 		case RECV_BUFFER:
3581 			rbp = hca->recv_pool;
3582 			break;
3583 		default:
3584 			return;
3585 	}
3586 	if (rbp == NULL)
3587 		return;
3588 
3589 	bp = rbp->bpool;
3590 
3591 	/*
3592 	 * Deregister the pool memory and free it.
3593 	 */
3594 	for (i = 0; i < bp->numelems; i++) {
3595 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3596 	}
3597 }
3598 
3599 static void
3600 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3601 {
3602 
3603 	rib_bufpool_t *rbp = NULL;
3604 	bufpool_t *bp;
3605 
3606 	/*
3607 	 * Obtain pool address based on type of pool
3608 	 */
3609 	switch (ptype) {
3610 		case SEND_BUFFER:
3611 			rbp = hca->send_pool;
3612 			break;
3613 		case RECV_BUFFER:
3614 			rbp = hca->recv_pool;
3615 			break;
3616 		default:
3617 			return;
3618 	}
3619 	if (rbp == NULL)
3620 		return;
3621 
3622 	bp = rbp->bpool;
3623 
3624 	/*
3625 	 * Free the pool memory.
3626 	 */
3627 	if (rbp->mr_hdl)
3628 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3629 
3630 	if (rbp->mr_desc)
3631 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3632 	if (bp->buf)
3633 		kmem_free(bp->buf, bp->bufsize);
3634 	mutex_destroy(&bp->buflock);
3635 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3636 	kmem_free(rbp, sizeof (rib_bufpool_t));
3637 }
3638 
3639 void
3640 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3641 {
3642 	/*
3643 	 * Deregister the pool memory and free it.
3644 	 */
3645 	rib_rbufpool_deregister(hca, ptype);
3646 	rib_rbufpool_free(hca, ptype);
3647 }
3648 
3649 /*
3650  * Fetch a buffer from the pool of type specified in rdbuf->type.
3651  */
3652 static rdma_stat
3653 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3654 {
3655 	rib_lrc_entry_t *rlep;
3656 
3657 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
3658 		rlep = rib_get_cache_buf(conn, rdbuf->len);
3659 		rdbuf->rb_private =  (caddr_t)rlep;
3660 		rdbuf->addr = rlep->lrc_buf;
3661 		rdbuf->handle = rlep->lrc_mhandle;
3662 		return (RDMA_SUCCESS);
3663 	}
3664 
3665 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
3666 	if (rdbuf->addr) {
3667 		switch (rdbuf->type) {
3668 		case SEND_BUFFER:
3669 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
3670 			break;
3671 		case RECV_BUFFER:
3672 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
3673 			break;
3674 		default:
3675 			rdbuf->len = 0;
3676 		}
3677 		return (RDMA_SUCCESS);
3678 	} else
3679 		return (RDMA_FAILED);
3680 }
3681 
3682 #if defined(MEASURE_POOL_DEPTH)
3683 static void rib_recv_bufs(uint32_t x) {
3684 
3685 }
3686 
3687 static void rib_send_bufs(uint32_t x) {
3688 
3689 }
3690 #endif
3691 
3692 /*
3693  * Fetch a buffer of specified type.
3694  * Note that rdbuf->handle is mw's rkey.
3695  */
3696 static void *
3697 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3698 {
3699 	rib_qp_t	*qp = ctoqp(conn);
3700 	rib_hca_t	*hca = qp->hca;
3701 	rdma_btype	ptype = rdbuf->type;
3702 	void		*buf;
3703 	rib_bufpool_t	*rbp = NULL;
3704 	bufpool_t	*bp;
3705 	int		i;
3706 
3707 	/*
3708 	 * Obtain pool address based on type of pool
3709 	 */
3710 	switch (ptype) {
3711 	case SEND_BUFFER:
3712 		rbp = hca->send_pool;
3713 		break;
3714 	case RECV_BUFFER:
3715 		rbp = hca->recv_pool;
3716 		break;
3717 	default:
3718 		return (NULL);
3719 	}
3720 	if (rbp == NULL)
3721 		return (NULL);
3722 
3723 	bp = rbp->bpool;
3724 
3725 	mutex_enter(&bp->buflock);
3726 	if (bp->buffree < 0) {
3727 		mutex_exit(&bp->buflock);
3728 		return (NULL);
3729 	}
3730 
3731 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
3732 	buf = bp->buflist[bp->buffree];
3733 	rdbuf->addr = buf;
3734 	rdbuf->len = bp->rsize;
3735 	for (i = bp->numelems - 1; i >= 0; i--) {
3736 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
3737 			rdbuf->handle.mrc_rmr =
3738 			    (uint32_t)rbp->mr_desc[i].md_rkey;
3739 			rdbuf->handle.mrc_linfo =
3740 			    (uintptr_t)rbp->mr_hdl[i];
3741 			rdbuf->handle.mrc_lmr =
3742 			    (uint32_t)rbp->mr_desc[i].md_lkey;
3743 #if defined(MEASURE_POOL_DEPTH)
3744 			if (ptype == SEND_BUFFER)
3745 				rib_send_bufs(MAX_BUFS - (bp->buffree+1));
3746 			if (ptype == RECV_BUFFER)
3747 				rib_recv_bufs(MAX_BUFS - (bp->buffree+1));
3748 #endif
3749 			bp->buffree--;
3750 
3751 			mutex_exit(&bp->buflock);
3752 
3753 			return (buf);
3754 		}
3755 	}
3756 
3757 	mutex_exit(&bp->buflock);
3758 
3759 	return (NULL);
3760 }
3761 
3762 static void
3763 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
3764 {
3765 
3766 	if (rdbuf->type == RDMA_LONG_BUFFER) {
3767 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
3768 		rdbuf->rb_private = NULL;
3769 		return;
3770 	}
3771 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
3772 }
3773 
3774 static void
3775 rib_rbuf_free(CONN *conn, int ptype, void *buf)
3776 {
3777 	rib_qp_t *qp = ctoqp(conn);
3778 	rib_hca_t *hca = qp->hca;
3779 	rib_bufpool_t *rbp = NULL;
3780 	bufpool_t *bp;
3781 
3782 	/*
3783 	 * Obtain pool address based on type of pool
3784 	 */
3785 	switch (ptype) {
3786 	case SEND_BUFFER:
3787 		rbp = hca->send_pool;
3788 		break;
3789 	case RECV_BUFFER:
3790 		rbp = hca->recv_pool;
3791 		break;
3792 	default:
3793 		return;
3794 	}
3795 	if (rbp == NULL)
3796 		return;
3797 
3798 	bp = rbp->bpool;
3799 
3800 	mutex_enter(&bp->buflock);
3801 	if (++bp->buffree >= bp->numelems) {
3802 		/*
3803 		 * Should never happen
3804 		 */
3805 		bp->buffree--;
3806 	} else {
3807 		bp->buflist[bp->buffree] = buf;
3808 	}
3809 	mutex_exit(&bp->buflock);
3810 }
3811 
3812 static rdma_stat
3813 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
3814 {
3815 	rw_enter(&connlist->conn_lock, RW_WRITER);
3816 	if (connlist->conn_hd) {
3817 		cn->c_next = connlist->conn_hd;
3818 		connlist->conn_hd->c_prev = cn;
3819 	}
3820 	connlist->conn_hd = cn;
3821 	rw_exit(&connlist->conn_lock);
3822 
3823 	return (RDMA_SUCCESS);
3824 }
3825 
3826 static rdma_stat
3827 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
3828 {
3829 	rw_enter(&connlist->conn_lock, RW_WRITER);
3830 	if (cn->c_prev) {
3831 		cn->c_prev->c_next = cn->c_next;
3832 	}
3833 	if (cn->c_next) {
3834 		cn->c_next->c_prev = cn->c_prev;
3835 	}
3836 	if (connlist->conn_hd == cn)
3837 		connlist->conn_hd = cn->c_next;
3838 	rw_exit(&connlist->conn_lock);
3839 
3840 	return (RDMA_SUCCESS);
3841 }
3842 
3843 /*
3844  * Connection management.
3845  * IBTF does not support recycling of channels. So connections are only
3846  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
3847  * C_DISCONN_PEND state. No C_IDLE state.
3848  * C_CONN_PEND state: Connection establishment in progress to the server.
3849  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
3850  * It has an RC channel associated with it. ibt_post_send/recv are allowed
3851  * only in this state.
3852  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
3853  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
3854  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
3855  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
3856  * c_ref drops to 0 (this indicates that RPC has no more references to this
3857  * connection), the connection should be destroyed. A connection transitions
3858  * into this state when it is being destroyed.
3859  */
3860 /* ARGSUSED */
3861 static rdma_stat
3862 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
3863 {
3864 	CONN *cn;
3865 	int status = RDMA_SUCCESS;
3866 	rib_hca_t *hca = rib_stat->hca;
3867 	rib_qp_t *qp;
3868 	clock_t cv_stat, timout;
3869 	rpcib_ping_t rpt;
3870 
3871 	if (hca == NULL)
3872 		return (RDMA_FAILED);
3873 
3874 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3875 	if (hca->state == HCA_DETACHED) {
3876 		rw_exit(&rib_stat->hca->state_lock);
3877 		return (RDMA_FAILED);
3878 	}
3879 	rw_exit(&rib_stat->hca->state_lock);
3880 
3881 again:
3882 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
3883 	cn = hca->cl_conn_list.conn_hd;
3884 	while (cn != NULL) {
3885 		/*
3886 		 * First, clear up any connection in the ERROR state
3887 		 */
3888 		mutex_enter(&cn->c_lock);
3889 		if (cn->c_state == C_ERROR_CONN) {
3890 			if (cn->c_ref == 0) {
3891 				/*
3892 				 * Remove connection from list and destroy it.
3893 				 */
3894 				cn->c_state = C_DISCONN_PEND;
3895 				mutex_exit(&cn->c_lock);
3896 				rw_exit(&hca->cl_conn_list.conn_lock);
3897 				(void) rib_disconnect_channel(cn,
3898 				    &hca->cl_conn_list);
3899 				goto again;
3900 			}
3901 			mutex_exit(&cn->c_lock);
3902 			cn = cn->c_next;
3903 			continue;
3904 		}
3905 		if (cn->c_state == C_DISCONN_PEND) {
3906 			mutex_exit(&cn->c_lock);
3907 			cn = cn->c_next;
3908 			continue;
3909 		}
3910 		if ((cn->c_raddr.len == svcaddr->len) &&
3911 		    bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
3912 			/*
3913 			 * Our connection. Give up conn list lock
3914 			 * as we are done traversing the list.
3915 			 */
3916 			rw_exit(&hca->cl_conn_list.conn_lock);
3917 			if (cn->c_state == C_CONNECTED) {
3918 				cn->c_ref++;	/* sharing a conn */
3919 				mutex_exit(&cn->c_lock);
3920 				*conn = cn;
3921 				return (status);
3922 			}
3923 			if (cn->c_state == C_CONN_PEND) {
3924 				/*
3925 				 * Hold a reference to this conn before
3926 				 * we give up the lock.
3927 				 */
3928 				cn->c_ref++;
3929 				timout =  ddi_get_lbolt() +
3930 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
3931 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
3932 				    &cn->c_lock, timout)) > 0 &&
3933 				    cn->c_state == C_CONN_PEND)
3934 					;
3935 				if (cv_stat == 0) {
3936 					cn->c_ref--;
3937 					mutex_exit(&cn->c_lock);
3938 					return (RDMA_INTR);
3939 				}
3940 				if (cv_stat < 0) {
3941 					cn->c_ref--;
3942 					mutex_exit(&cn->c_lock);
3943 					return (RDMA_TIMEDOUT);
3944 				}
3945 				if (cn->c_state == C_CONNECTED) {
3946 					*conn = cn;
3947 					mutex_exit(&cn->c_lock);
3948 					return (status);
3949 				} else {
3950 					cn->c_ref--;
3951 					mutex_exit(&cn->c_lock);
3952 					return (RDMA_TIMEDOUT);
3953 				}
3954 			}
3955 		}
3956 		mutex_exit(&cn->c_lock);
3957 		cn = cn->c_next;
3958 	}
3959 	rw_exit(&hca->cl_conn_list.conn_lock);
3960 
3961 	bzero(&rpt, sizeof (rpcib_ping_t));
3962 
3963 	status = rib_ping_srv(addr_type, svcaddr, &rpt);
3964 	if (status != RDMA_SUCCESS) {
3965 		return (RDMA_FAILED);
3966 	}
3967 
3968 	/*
3969 	 * Channel to server doesn't exist yet, create one.
3970 	 */
3971 	if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
3972 		return (RDMA_FAILED);
3973 	}
3974 	cn = qptoc(qp);
3975 	cn->c_state = C_CONN_PEND;
3976 	cn->c_ref = 1;
3977 
3978 	/*
3979 	 * Add to conn list.
3980 	 * We had given up the READER lock. In the time since then,
3981 	 * another thread might have created the connection we are
3982 	 * trying here. But for now, that is quiet alright - there
3983 	 * might be two connections between a pair of hosts instead
3984 	 * of one. If we really want to close that window,
3985 	 * then need to check the list after acquiring the
3986 	 * WRITER lock.
3987 	 */
3988 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
3989 	status = rib_conn_to_srv(hca, qp, &rpt);
3990 	mutex_enter(&cn->c_lock);
3991 	if (status == RDMA_SUCCESS) {
3992 		cn->c_state = C_CONNECTED;
3993 		*conn = cn;
3994 	} else {
3995 		cn->c_state = C_ERROR_CONN;
3996 		cn->c_ref--;
3997 	}
3998 	cv_broadcast(&cn->c_cv);
3999 	mutex_exit(&cn->c_lock);
4000 	return (status);
4001 }
4002 
4003 static rdma_stat
4004 rib_conn_release(CONN *conn)
4005 {
4006 	rib_qp_t	*qp = ctoqp(conn);
4007 
4008 	mutex_enter(&conn->c_lock);
4009 	conn->c_ref--;
4010 
4011 	/*
4012 	 * If a conn is C_ERROR_CONN, close the channel.
4013 	 * If it's CONNECTED, keep it that way.
4014 	 */
4015 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4016 		conn->c_state = C_DISCONN_PEND;
4017 		mutex_exit(&conn->c_lock);
4018 		if (qp->mode == RIB_SERVER)
4019 			(void) rib_disconnect_channel(conn,
4020 			    &qp->hca->srv_conn_list);
4021 		else
4022 			(void) rib_disconnect_channel(conn,
4023 			    &qp->hca->cl_conn_list);
4024 		return (RDMA_SUCCESS);
4025 	}
4026 	mutex_exit(&conn->c_lock);
4027 	return (RDMA_SUCCESS);
4028 }
4029 
4030 /*
4031  * Add at front of list
4032  */
4033 static struct rdma_done_list *
4034 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4035 {
4036 	struct rdma_done_list *rd;
4037 
4038 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4039 
4040 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4041 	rd->xid = xid;
4042 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4043 
4044 	rd->prev = NULL;
4045 	rd->next = qp->rdlist;
4046 	if (qp->rdlist != NULL)
4047 		qp->rdlist->prev = rd;
4048 	qp->rdlist = rd;
4049 
4050 	return (rd);
4051 }
4052 
4053 static void
4054 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4055 {
4056 	struct rdma_done_list *r;
4057 
4058 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4059 
4060 	r = rd->next;
4061 	if (r != NULL) {
4062 		r->prev = rd->prev;
4063 	}
4064 
4065 	r = rd->prev;
4066 	if (r != NULL) {
4067 		r->next = rd->next;
4068 	} else {
4069 		qp->rdlist = rd->next;
4070 	}
4071 
4072 	cv_destroy(&rd->rdma_done_cv);
4073 	kmem_free(rd, sizeof (*rd));
4074 }
4075 
4076 static void
4077 rdma_done_rem_list(rib_qp_t *qp)
4078 {
4079 	struct rdma_done_list	*r, *n;
4080 
4081 	mutex_enter(&qp->rdlist_lock);
4082 	for (r = qp->rdlist; r != NULL; r = n) {
4083 		n = r->next;
4084 		rdma_done_rm(qp, r);
4085 	}
4086 	mutex_exit(&qp->rdlist_lock);
4087 }
4088 
4089 static void
4090 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4091 {
4092 	struct rdma_done_list *r = qp->rdlist;
4093 
4094 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4095 
4096 	while (r) {
4097 		if (r->xid == xid) {
4098 			cv_signal(&r->rdma_done_cv);
4099 			return;
4100 		} else {
4101 			r = r->next;
4102 		}
4103 	}
4104 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4105 	    int, xid);
4106 }
4107 
4108 
4109 /*
4110  * Goes through all connections and closes the channel
4111  * This will cause all the WRs on those channels to be
4112  * flushed.
4113  */
4114 static void
4115 rib_close_channels(rib_conn_list_t *connlist)
4116 {
4117 	CONN 		*conn;
4118 	rib_qp_t	*qp;
4119 
4120 	rw_enter(&connlist->conn_lock, RW_READER);
4121 	conn = connlist->conn_hd;
4122 	while (conn != NULL) {
4123 		mutex_enter(&conn->c_lock);
4124 		qp = ctoqp(conn);
4125 		if (conn->c_state == C_CONNECTED) {
4126 			/*
4127 			 * Live connection in CONNECTED state.
4128 			 * Call ibt_close_rc_channel in nonblocking mode
4129 			 * with no callbacks.
4130 			 */
4131 			conn->c_state = C_ERROR_CONN;
4132 			(void) ibt_close_rc_channel(qp->qp_hdl,
4133 			    IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
4134 			(void) ibt_free_channel(qp->qp_hdl);
4135 			qp->qp_hdl = NULL;
4136 		} else {
4137 			if (conn->c_state == C_ERROR_CONN &&
4138 			    qp->qp_hdl != NULL) {
4139 				/*
4140 				 * Connection in ERROR state but
4141 				 * channel is not yet freed.
4142 				 */
4143 				(void) ibt_close_rc_channel(qp->qp_hdl,
4144 				    IBT_NOCALLBACKS, NULL, 0, NULL,
4145 				    NULL, 0);
4146 				(void) ibt_free_channel(qp->qp_hdl);
4147 				qp->qp_hdl = NULL;
4148 			}
4149 		}
4150 		mutex_exit(&conn->c_lock);
4151 		conn = conn->c_next;
4152 	}
4153 	rw_exit(&connlist->conn_lock);
4154 }
4155 
4156 /*
4157  * Frees up all connections that are no longer being referenced
4158  */
4159 static void
4160 rib_purge_connlist(rib_conn_list_t *connlist)
4161 {
4162 	CONN 		*conn;
4163 
4164 top:
4165 	rw_enter(&connlist->conn_lock, RW_READER);
4166 	conn = connlist->conn_hd;
4167 	while (conn != NULL) {
4168 		mutex_enter(&conn->c_lock);
4169 
4170 		/*
4171 		 * At this point connection is either in ERROR
4172 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4173 		 * then some other thread is culling that connection.
4174 		 * If not and if c_ref is 0, then destroy the connection.
4175 		 */
4176 		if (conn->c_ref == 0 &&
4177 		    conn->c_state != C_DISCONN_PEND) {
4178 			/*
4179 			 * Cull the connection
4180 			 */
4181 			conn->c_state = C_DISCONN_PEND;
4182 			mutex_exit(&conn->c_lock);
4183 			rw_exit(&connlist->conn_lock);
4184 			(void) rib_disconnect_channel(conn, connlist);
4185 			goto top;
4186 		} else {
4187 			/*
4188 			 * conn disconnect already scheduled or will
4189 			 * happen from conn_release when c_ref drops to 0.
4190 			 */
4191 			mutex_exit(&conn->c_lock);
4192 		}
4193 		conn = conn->c_next;
4194 	}
4195 	rw_exit(&connlist->conn_lock);
4196 
4197 	/*
4198 	 * At this point, only connections with c_ref != 0 are on the list
4199 	 */
4200 }
4201 
4202 /*
4203  * Cleans and closes up all uses of the HCA
4204  */
4205 static void
4206 rib_detach_hca(rib_hca_t *hca)
4207 {
4208 
4209 	/*
4210 	 * Stop all services on the HCA
4211 	 * Go through cl_conn_list and close all rc_channels
4212 	 * Go through svr_conn_list and close all rc_channels
4213 	 * Free connections whose c_ref has dropped to 0
4214 	 * Destroy all CQs
4215 	 * Deregister and released all buffer pool memory after all
4216 	 * connections are destroyed
4217 	 * Free the protection domain
4218 	 * ibt_close_hca()
4219 	 */
4220 	rw_enter(&hca->state_lock, RW_WRITER);
4221 	if (hca->state == HCA_DETACHED) {
4222 		rw_exit(&hca->state_lock);
4223 		return;
4224 	}
4225 
4226 	hca->state = HCA_DETACHED;
4227 	rib_stat->nhca_inited--;
4228 
4229 	rib_stop_services(hca);
4230 	rib_close_channels(&hca->cl_conn_list);
4231 	rib_close_channels(&hca->srv_conn_list);
4232 
4233 	rib_mod.rdma_count--;
4234 
4235 	rw_exit(&hca->state_lock);
4236 
4237 	/*
4238 	 * purge will free all datastructures used by CQ handlers. We don't
4239 	 * want to receive completions after purge, so we'll free the CQs now.
4240 	 */
4241 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4242 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4243 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4244 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4245 
4246 	rib_purge_connlist(&hca->cl_conn_list);
4247 	rib_purge_connlist(&hca->srv_conn_list);
4248 
4249 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4250 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4251 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4252 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4253 	if (stats_enabled) {
4254 		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4255 		    GLOBAL_ZONEID);
4256 	}
4257 
4258 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4259 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4260 	if (hca->srv_conn_list.conn_hd == NULL &&
4261 	    hca->cl_conn_list.conn_hd == NULL) {
4262 		/*
4263 		 * conn_lists are NULL, so destroy
4264 		 * buffers, close hca and be done.
4265 		 */
4266 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4267 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4268 		rib_destroy_cache(hca);
4269 		rdma_unregister_mod(&rib_mod);
4270 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4271 		(void) ibt_close_hca(hca->hca_hdl);
4272 		hca->hca_hdl = NULL;
4273 	}
4274 	rw_exit(&hca->cl_conn_list.conn_lock);
4275 	rw_exit(&hca->srv_conn_list.conn_lock);
4276 
4277 	if (hca->hca_hdl != NULL) {
4278 		mutex_enter(&hca->inuse_lock);
4279 		while (hca->inuse)
4280 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4281 		mutex_exit(&hca->inuse_lock);
4282 
4283 		rdma_unregister_mod(&rib_mod);
4284 
4285 		/*
4286 		 * conn_lists are now NULL, so destroy
4287 		 * buffers, close hca and be done.
4288 		 */
4289 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4290 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4291 		rib_destroy_cache(hca);
4292 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4293 		(void) ibt_close_hca(hca->hca_hdl);
4294 		hca->hca_hdl = NULL;
4295 	}
4296 }
4297 
4298 static void
4299 rib_server_side_cache_reclaim(void *argp)
4300 {
4301 	cache_avl_struct_t    *rcas;
4302 	rib_lrc_entry_t		*rb;
4303 	rib_hca_t *hca = (rib_hca_t *)argp;
4304 
4305 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4306 	rcas = avl_first(&hca->avl_tree);
4307 	if (rcas != NULL)
4308 		avl_remove(&hca->avl_tree, rcas);
4309 
4310 	while (rcas != NULL) {
4311 		while (rcas->r.forw != &rcas->r) {
4312 			rcas->elements--;
4313 			rib_total_buffers --;
4314 			rb = rcas->r.forw;
4315 			remque(rb);
4316 			if (rb->registered)
4317 				(void) rib_deregistermem_via_hca(hca,
4318 				    rb->lrc_buf, rb->lrc_mhandle);
4319 			cache_allocation -= rb->lrc_len;
4320 			kmem_free(rb->lrc_buf, rb->lrc_len);
4321 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4322 		}
4323 		mutex_destroy(&rcas->node_lock);
4324 		kmem_cache_free(hca->server_side_cache, rcas);
4325 		rcas = avl_first(&hca->avl_tree);
4326 		if (rcas != NULL)
4327 			avl_remove(&hca->avl_tree, rcas);
4328 	}
4329 	rw_exit(&hca->avl_rw_lock);
4330 }
4331 
4332 static void
4333 rib_server_side_cache_cleanup(void *argp)
4334 {
4335 	cache_avl_struct_t    *rcas;
4336 	rib_lrc_entry_t		*rb;
4337 	rib_hca_t *hca = (rib_hca_t *)argp;
4338 
4339 	rw_enter(&hca->avl_rw_lock, RW_READER);
4340 	if (cache_allocation < cache_limit) {
4341 		rw_exit(&hca->avl_rw_lock);
4342 		return;
4343 	}
4344 	rw_exit(&hca->avl_rw_lock);
4345 
4346 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4347 	rcas = avl_last(&hca->avl_tree);
4348 	if (rcas != NULL)
4349 		avl_remove(&hca->avl_tree, rcas);
4350 
4351 	while (rcas != NULL) {
4352 		while (rcas->r.forw != &rcas->r) {
4353 			rcas->elements--;
4354 			rib_total_buffers --;
4355 			rb = rcas->r.forw;
4356 			remque(rb);
4357 			if (rb->registered)
4358 				(void) rib_deregistermem_via_hca(hca,
4359 				    rb->lrc_buf, rb->lrc_mhandle);
4360 			cache_allocation -= rb->lrc_len;
4361 			kmem_free(rb->lrc_buf, rb->lrc_len);
4362 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4363 		}
4364 		mutex_destroy(&rcas->node_lock);
4365 		if (hca->server_side_cache) {
4366 			kmem_cache_free(hca->server_side_cache, rcas);
4367 		}
4368 		if ((cache_allocation) < cache_limit) {
4369 			rw_exit(&hca->avl_rw_lock);
4370 			return;
4371 		}
4372 
4373 		rcas = avl_last(&hca->avl_tree);
4374 		if (rcas != NULL)
4375 			avl_remove(&hca->avl_tree, rcas);
4376 	}
4377 	rw_exit(&hca->avl_rw_lock);
4378 }
4379 
4380 static int
4381 avl_compare(const void *t1, const void *t2)
4382 {
4383 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4384 		return (0);
4385 
4386 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4387 		return (-1);
4388 
4389 	return (1);
4390 }
4391 
4392 static void
4393 rib_destroy_cache(rib_hca_t *hca)
4394 {
4395 	if (hca->reg_cache_clean_up != NULL) {
4396 		ddi_taskq_destroy(hca->reg_cache_clean_up);
4397 		hca->reg_cache_clean_up = NULL;
4398 	}
4399 	if (hca->avl_init) {
4400 		rib_server_side_cache_reclaim((void *)hca);
4401 		if (hca->server_side_cache) {
4402 			kmem_cache_destroy(hca->server_side_cache);
4403 			hca->server_side_cache = NULL;
4404 		}
4405 		avl_destroy(&hca->avl_tree);
4406 		mutex_destroy(&hca->cache_allocation);
4407 		rw_destroy(&hca->avl_rw_lock);
4408 	}
4409 	hca->avl_init = FALSE;
4410 }
4411 
4412 static void
4413 rib_force_cleanup(void *hca)
4414 {
4415 	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
4416 		(void) ddi_taskq_dispatch(
4417 		    ((rib_hca_t *)hca)->reg_cache_clean_up,
4418 		    rib_server_side_cache_cleanup,
4419 		    (void *)hca, DDI_NOSLEEP);
4420 }
4421 
4422 static rib_lrc_entry_t *
4423 rib_get_cache_buf(CONN *conn, uint32_t len)
4424 {
4425 	cache_avl_struct_t	cas, *rcas;
4426 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4427 	rib_lrc_entry_t *reply_buf;
4428 	avl_index_t where = NULL;
4429 	uint64_t c_alloc = 0;
4430 
4431 	if (!hca->avl_init)
4432 		goto  error_alloc;
4433 
4434 	cas.len = len;
4435 
4436 	rw_enter(&hca->avl_rw_lock, RW_READER);
4437 
4438 	mutex_enter(&hca->cache_allocation);
4439 	c_alloc = cache_allocation;
4440 	mutex_exit(&hca->cache_allocation);
4441 
4442 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
4443 	    &where)) == NULL) {
4444 		/* Am I above the cache limit */
4445 		if ((c_alloc + len) >= cache_limit) {
4446 			rib_force_cleanup((void *)hca);
4447 			rw_exit(&hca->avl_rw_lock);
4448 			cache_misses_above_the_limit ++;
4449 
4450 			/* Allocate and register the buffer directly */
4451 			goto error_alloc;
4452 		}
4453 
4454 		rw_exit(&hca->avl_rw_lock);
4455 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
4456 
4457 		/* Recheck to make sure no other thread added the entry in */
4458 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
4459 		    &cas, &where)) == NULL) {
4460 			/* Allocate an avl tree entry */
4461 			rcas = (cache_avl_struct_t *)
4462 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
4463 
4464 			bzero(rcas, sizeof (cache_avl_struct_t));
4465 			rcas->elements = 0;
4466 			rcas->r.forw = &rcas->r;
4467 			rcas->r.back = &rcas->r;
4468 			rcas->len = len;
4469 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
4470 			avl_insert(&hca->avl_tree, rcas, where);
4471 		}
4472 	}
4473 
4474 	mutex_enter(&rcas->node_lock);
4475 
4476 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
4477 		rib_total_buffers--;
4478 		cache_hits++;
4479 		reply_buf = rcas->r.forw;
4480 		remque(reply_buf);
4481 		rcas->elements--;
4482 		mutex_exit(&rcas->node_lock);
4483 		rw_exit(&hca->avl_rw_lock);
4484 		mutex_enter(&hca->cache_allocation);
4485 		cache_allocation -= len;
4486 		mutex_exit(&hca->cache_allocation);
4487 	} else {
4488 		/* Am I above the cache limit */
4489 		mutex_exit(&rcas->node_lock);
4490 		if ((c_alloc + len) >= cache_limit) {
4491 			rib_force_cleanup((void *)hca);
4492 			rw_exit(&hca->avl_rw_lock);
4493 			cache_misses_above_the_limit ++;
4494 			/* Allocate and register the buffer directly */
4495 			goto error_alloc;
4496 		}
4497 		rw_exit(&hca->avl_rw_lock);
4498 		cache_misses ++;
4499 		/* Allocate a reply_buf entry */
4500 		reply_buf = (rib_lrc_entry_t *)
4501 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4502 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
4503 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
4504 		reply_buf->lrc_len  = len;
4505 		reply_buf->registered = FALSE;
4506 		reply_buf->avl_node = (void *)rcas;
4507 	}
4508 
4509 	return (reply_buf);
4510 
4511 error_alloc:
4512 	reply_buf = (rib_lrc_entry_t *)
4513 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4514 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
4515 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
4516 	reply_buf->lrc_len = len;
4517 	reply_buf->registered = FALSE;
4518 	reply_buf->avl_node = NULL;
4519 
4520 	return (reply_buf);
4521 }
4522 
4523 /*
4524  * Return a pre-registered back to the cache (without
4525  * unregistering the buffer)..
4526  */
4527 
4528 static void
4529 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
4530 {
4531 	cache_avl_struct_t    cas, *rcas;
4532 	avl_index_t where = NULL;
4533 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4534 
4535 	if (!hca->avl_init)
4536 		goto  error_free;
4537 
4538 	cas.len = reg_buf->lrc_len;
4539 	rw_enter(&hca->avl_rw_lock, RW_READER);
4540 	if ((rcas = (cache_avl_struct_t *)
4541 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
4542 		rw_exit(&hca->avl_rw_lock);
4543 		goto error_free;
4544 	} else {
4545 		rib_total_buffers ++;
4546 		cas.len = reg_buf->lrc_len;
4547 		mutex_enter(&rcas->node_lock);
4548 		insque(reg_buf, &rcas->r);
4549 		rcas->elements ++;
4550 		mutex_exit(&rcas->node_lock);
4551 		rw_exit(&hca->avl_rw_lock);
4552 		mutex_enter(&hca->cache_allocation);
4553 		cache_allocation += cas.len;
4554 		mutex_exit(&hca->cache_allocation);
4555 	}
4556 
4557 	return;
4558 
4559 error_free:
4560 
4561 	if (reg_buf->registered)
4562 		(void) rib_deregistermem_via_hca(hca,
4563 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
4564 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
4565 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
4566 }
4567 
4568 static rdma_stat
4569 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
4570 	uint_t buflen, struct mrc *buf_handle)
4571 {
4572 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
4573 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
4574 	rdma_stat	status;
4575 
4576 
4577 	/*
4578 	 * Note: ALL buffer pools use the same memory type RDMARW.
4579 	 */
4580 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
4581 	if (status == RDMA_SUCCESS) {
4582 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
4583 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4584 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4585 	} else {
4586 		buf_handle->mrc_linfo = NULL;
4587 		buf_handle->mrc_lmr = 0;
4588 		buf_handle->mrc_rmr = 0;
4589 	}
4590 	return (status);
4591 }
4592 
4593 /* ARGSUSED */
4594 static rdma_stat
4595 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
4596     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
4597 {
4598 
4599 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
4600 	return (RDMA_SUCCESS);
4601 }
4602 
4603 /* ARGSUSED */
4604 static rdma_stat
4605 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
4606 {
4607 
4608 	(void) ibt_deregister_mr(hca->hca_hdl,
4609 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4610 	return (RDMA_SUCCESS);
4611 }
4612 
4613 /*
4614  * Check if the IP interface named by `lifrp' is RDMA-capable.
4615  */
4616 static boolean_t
4617 rpcib_rdma_capable_interface(struct lifreq *lifrp)
4618 {
4619 	char ifname[LIFNAMSIZ];
4620 	char *cp;
4621 
4622 	if (lifrp->lifr_type == IFT_IB)
4623 		return (B_TRUE);
4624 
4625 	/*
4626 	 * Strip off the logical interface portion before getting
4627 	 * intimate with the name.
4628 	 */
4629 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
4630 	if ((cp = strchr(ifname, ':')) != NULL)
4631 		*cp = '\0';
4632 
4633 	return (strcmp("lo0", ifname) == 0);
4634 }
4635 
4636 static int
4637 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
4638 {
4639 	vnode_t *kvp, *vp;
4640 	TIUSER  *tiptr;
4641 	struct  strioctl iocb;
4642 	k_sigset_t smask;
4643 	int	err = 0;
4644 
4645 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
4646 		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
4647 		    &tiptr, CRED()) == 0) {
4648 			vp = tiptr->fp->f_vnode;
4649 		} else {
4650 			VN_RELE(kvp);
4651 			return (EPROTO);
4652 		}
4653 	} else {
4654 		return (EPROTO);
4655 	}
4656 
4657 	iocb.ic_cmd = cmd;
4658 	iocb.ic_timout = 0;
4659 	iocb.ic_len = len;
4660 	iocb.ic_dp = (caddr_t)arg;
4661 	sigintr(&smask, 0);
4662 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4663 	sigunintr(&smask);
4664 	(void) t_kclose(tiptr, 0);
4665 	VN_RELE(kvp);
4666 	return (err);
4667 }
4668 
4669 /*
4670  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
4671  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
4672  */
4673 static int
4674 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
4675 {
4676 	int err;
4677 	struct lifnum lifn;
4678 
4679 	bzero(&lifn, sizeof (struct lifnum));
4680 	lifn.lifn_family = AF_UNSPEC;
4681 
4682 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
4683 	if (err != 0)
4684 		return (err);
4685 
4686 	/*
4687 	 * Pad the interface count to account for additional interfaces that
4688 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
4689 	 */
4690 	lifn.lifn_count += 4;
4691 
4692 	bzero(lifcp, sizeof (struct lifconf));
4693 	lifcp->lifc_family = AF_UNSPEC;
4694 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
4695 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
4696 
4697 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
4698 	if (err != 0) {
4699 		kmem_free(lifcp->lifc_buf, *bufsizep);
4700 		return (err);
4701 	}
4702 	return (0);
4703 }
4704 
4705 static boolean_t
4706 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
4707 {
4708 	uint_t i, nifs;
4709 	uint_t bufsize;
4710 	struct lifconf lifc;
4711 	struct lifreq *lifrp;
4712 	struct sockaddr_in *sinp;
4713 	struct sockaddr_in6 *sin6p;
4714 
4715 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
4716 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
4717 
4718 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
4719 		return (B_FALSE);
4720 
4721 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
4722 		kmem_free(lifc.lifc_buf, bufsize);
4723 		return (B_FALSE);
4724 	}
4725 
4726 	/*
4727 	 * Worst case is that all of the addresses are IB-capable and have
4728 	 * the same address family, so size our buffers accordingly.
4729 	 */
4730 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
4731 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
4732 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
4733 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
4734 
4735 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
4736 		if (!rpcib_rdma_capable_interface(lifrp))
4737 			continue;
4738 
4739 		if (lifrp->lifr_addr.ss_family == AF_INET) {
4740 			sinp = addrs4->ri_list;
4741 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
4742 			    sizeof (struct sockaddr_in));
4743 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
4744 			sin6p = addrs6->ri_list;
4745 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
4746 			    sizeof (struct sockaddr_in6));
4747 		}
4748 	}
4749 
4750 	kmem_free(lifc.lifc_buf, bufsize);
4751 	return (B_TRUE);
4752 }
4753 
4754 /* ARGSUSED */
4755 static int rpcib_cache_kstat_update(kstat_t *ksp, int rw) {
4756 
4757 	if (KSTAT_WRITE == rw) {
4758 		return (EACCES);
4759 	}
4760 	rpcib_kstat.cache_limit.value.ui64 =
4761 	    (uint64_t)cache_limit;
4762 	rpcib_kstat.cache_allocation.value.ui64 =
4763 	    (uint64_t)cache_allocation;
4764 	rpcib_kstat.cache_hits.value.ui64 =
4765 	    (uint64_t)cache_hits;
4766 	rpcib_kstat.cache_misses.value.ui64 =
4767 	    (uint64_t)cache_misses;
4768 	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
4769 	    (uint64_t)cache_misses_above_the_limit;
4770 	return (0);
4771 }
4772