xref: /titanic_51/usr/src/uts/common/rpc/rpcib.c (revision 16dd44c265271a75647fb0bb41109bb7c585a526)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developors:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * The rpcib plugin. Implements the interface for RDMATF's
42  * interaction with IBTF.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84 
85 #define	NFS_RDMA_PORT	2050
86 
87 /*
88  * Convenience structures for connection management
89  */
90 typedef struct rpcib_ipaddrs {
91 	void	*ri_list;	/* pointer to list of addresses */
92 	uint_t	ri_count;	/* number of addresses in list */
93 	uint_t	ri_size;	/* size of ri_list in bytes */
94 } rpcib_ipaddrs_t;
95 
96 
97 typedef struct rpcib_ping {
98 	rib_hca_t  *hca;
99 	ibt_path_info_t path;
100 	ibt_ip_addr_t srcip;
101 	ibt_ip_addr_t dstip;
102 } rpcib_ping_t;
103 
104 /*
105  * Prototype declarations for driver ops
106  */
107 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
108 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
109 				void *, void **);
110 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
111 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
112 static int	rpcib_do_ip_ioctl(int, int, void *);
113 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
114 static int rpcib_cache_kstat_update(kstat_t *, int);
115 static void rib_force_cleanup(void *);
116 
117 struct {
118 	kstat_named_t cache_limit;
119 	kstat_named_t cache_allocation;
120 	kstat_named_t cache_hits;
121 	kstat_named_t cache_misses;
122 	kstat_named_t cache_misses_above_the_limit;
123 } rpcib_kstat = {
124 	{"cache_limit",			KSTAT_DATA_UINT64 },
125 	{"cache_allocation",		KSTAT_DATA_UINT64 },
126 	{"cache_hits",			KSTAT_DATA_UINT64 },
127 	{"cache_misses",		KSTAT_DATA_UINT64 },
128 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
129 };
130 
131 /* rpcib cb_ops */
132 static struct cb_ops rpcib_cbops = {
133 	nulldev,		/* open */
134 	nulldev,		/* close */
135 	nodev,			/* strategy */
136 	nodev,			/* print */
137 	nodev,			/* dump */
138 	nodev,			/* read */
139 	nodev,			/* write */
140 	nodev,			/* ioctl */
141 	nodev,			/* devmap */
142 	nodev,			/* mmap */
143 	nodev,			/* segmap */
144 	nochpoll,		/* poll */
145 	ddi_prop_op,		/* prop_op */
146 	NULL,			/* stream */
147 	D_MP,			/* cb_flag */
148 	CB_REV,			/* rev */
149 	nodev,			/* int (*cb_aread)() */
150 	nodev			/* int (*cb_awrite)() */
151 };
152 
153 /*
154  * Device options
155  */
156 static struct dev_ops rpcib_ops = {
157 	DEVO_REV,		/* devo_rev, */
158 	0,			/* refcnt  */
159 	rpcib_getinfo,		/* info */
160 	nulldev,		/* identify */
161 	nulldev,		/* probe */
162 	rpcib_attach,		/* attach */
163 	rpcib_detach,		/* detach */
164 	nodev,			/* reset */
165 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
166 	NULL,			/* bus operations */
167 	NULL,			/* power */
168 	ddi_quiesce_not_needed,		/* quiesce */
169 };
170 
171 /*
172  * Module linkage information.
173  */
174 
175 static struct modldrv rib_modldrv = {
176 	&mod_driverops,		/* Driver module */
177 	"RPCIB plugin driver",	/* Driver name and version */
178 	&rpcib_ops,		/* Driver ops */
179 };
180 
181 static struct modlinkage rib_modlinkage = {
182 	MODREV_1,
183 	(void *)&rib_modldrv,
184 	NULL
185 };
186 
187 typedef struct rib_lrc_entry {
188 	struct rib_lrc_entry *forw;
189 	struct rib_lrc_entry *back;
190 	char *lrc_buf;
191 
192 	uint32_t lrc_len;
193 	void  *avl_node;
194 	bool_t registered;
195 
196 	struct mrc lrc_mhandle;
197 	bool_t lrc_on_freed_list;
198 } rib_lrc_entry_t;
199 
200 typedef	struct cache_struct	{
201 	rib_lrc_entry_t		r;
202 	uint32_t		len;
203 	uint32_t		elements;
204 	kmutex_t		node_lock;
205 	avl_node_t		avl_link;
206 } cache_avl_struct_t;
207 
208 static uint64_t	rib_total_buffers = 0;
209 uint64_t	cache_limit = 100 * 1024 * 1024;
210 static volatile uint64_t	cache_allocation = 0;
211 static uint64_t	cache_watermark = 80 * 1024 * 1024;
212 static uint64_t	cache_hits = 0;
213 static uint64_t	cache_misses = 0;
214 static uint64_t	cache_cold_misses = 0;
215 static uint64_t	cache_hot_misses = 0;
216 static uint64_t	cache_misses_above_the_limit = 0;
217 static bool_t	stats_enabled = FALSE;
218 
219 static uint64_t max_unsignaled_rws = 5;
220 
221 /*
222  * rib_stat: private data pointer used when registering
223  *	with the IBTF.  It is returned to the consumer
224  *	in all callbacks.
225  */
226 static rpcib_state_t *rib_stat = NULL;
227 
228 #define	RNR_RETRIES	IBT_RNR_RETRY_1
229 #define	MAX_PORTS	2
230 
231 int preposted_rbufs = RDMA_BUFS_GRANT;
232 int send_threshold = 1;
233 
234 /*
235  * State of the plugin.
236  * ACCEPT = accepting new connections and requests.
237  * NO_ACCEPT = not accepting new connection and requests.
238  * This should eventually move to rpcib_state_t structure, since this
239  * will tell in which state the plugin is for a particular type of service
240  * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
241  * state for one and in no_accept state for the other.
242  */
243 int		plugin_state;
244 kmutex_t	plugin_state_lock;
245 
246 ldi_ident_t rpcib_li;
247 
248 /*
249  * RPCIB RDMATF operations
250  */
251 #if defined(MEASURE_POOL_DEPTH)
252 static void rib_posted_rbufs(uint32_t x) { return; }
253 #endif
254 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
255 static rdma_stat rib_disconnect(CONN *conn);
256 static void rib_listen(struct rdma_svc_data *rd);
257 static void rib_listen_stop(struct rdma_svc_data *rd);
258 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
259 	uint_t buflen, struct mrc *buf_handle);
260 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
261 	struct mrc buf_handle);
262 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
263 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
264 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
265 		struct mrc buf_handle);
266 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
267 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
268 	void *lrc);
269 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
270 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
271 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
272 	caddr_t buf, int len, int cpu);
273 
274 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
275 
276 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
277 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
278 
279 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
280 
281 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
282 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
283 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
284 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
285 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
286 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
287 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
288 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
289 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
290 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
291 static rdma_stat rib_conn_release(CONN *conn);
292 static rdma_stat rib_getinfo(rdma_info_t *info);
293 
294 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
295 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
296 static void rib_destroy_cache(rib_hca_t *hca);
297 static	void	rib_server_side_cache_reclaim(void *argp);
298 static int avl_compare(const void *t1, const void *t2);
299 
300 static void rib_stop_services(rib_hca_t *);
301 static void rib_close_channels(rib_conn_list_t *);
302 
303 /*
304  * RPCIB addressing operations
305  */
306 
307 /*
308  * RDMA operations the RPCIB module exports
309  */
310 static rdmaops_t rib_ops = {
311 	rib_reachable,
312 	rib_conn_get,
313 	rib_conn_release,
314 	rib_listen,
315 	rib_listen_stop,
316 	rib_registermem,
317 	rib_deregistermem,
318 	rib_registermemsync,
319 	rib_deregistermemsync,
320 	rib_syncmem,
321 	rib_reg_buf_alloc,
322 	rib_reg_buf_free,
323 	rib_send,
324 	rib_send_resp,
325 	rib_post_resp,
326 	rib_post_resp_remove,
327 	rib_post_recv,
328 	rib_recv,
329 	rib_read,
330 	rib_write,
331 	rib_getinfo,
332 };
333 
334 /*
335  * RDMATF RPCIB plugin details
336  */
337 static rdma_mod_t rib_mod = {
338 	"ibtf",		/* api name */
339 	RDMATF_VERS_1,
340 	0,
341 	&rib_ops,	/* rdma op vector for ibtf */
342 };
343 
344 static rdma_stat open_hcas(rpcib_state_t *);
345 static rdma_stat rib_qp_init(rib_qp_t *, int);
346 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
347 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
348 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
349 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
350 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
351 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
352 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
353 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
354 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
355 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
356 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
357 	rib_qp_t **);
358 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
359 	rib_qp_t **);
360 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
361 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
362 static int rib_free_sendwait(struct send_wid *);
363 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
364 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
365 static void rdma_done_rem_list(rib_qp_t *);
366 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
367 
368 static void rib_async_handler(void *,
369 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
370 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
371 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
372 static int rib_free_svc_recv(struct svc_recv *);
373 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
374 static void rib_free_wid(struct recv_wid *);
375 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
376 static void rib_detach_hca(rib_hca_t *);
377 
378 /*
379  * Registration with IBTF as a consumer
380  */
381 static struct ibt_clnt_modinfo_s rib_modinfo = {
382 	IBTI_V_CURR,
383 	IBT_GENERIC,
384 	rib_async_handler,	/* async event handler */
385 	NULL,			/* Memory Region Handler */
386 	"nfs/ib"
387 };
388 
389 /*
390  * Global strucuture
391  */
392 
393 typedef struct rpcib_s {
394 	dev_info_t	*rpcib_dip;
395 	kmutex_t	rpcib_mutex;
396 } rpcib_t;
397 
398 rpcib_t rpcib;
399 
400 /*
401  * /etc/system controlled variable to control
402  * debugging in rpcib kernel module.
403  * Set it to values greater than 1 to control
404  * the amount of debugging messages required.
405  */
406 int rib_debug = 0;
407 
408 int
409 _init(void)
410 {
411 	int error;
412 
413 	error = mod_install((struct modlinkage *)&rib_modlinkage);
414 	if (error != 0) {
415 		/*
416 		 * Could not load module
417 		 */
418 		return (error);
419 	}
420 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
421 	return (0);
422 }
423 
424 int
425 _fini()
426 {
427 	int status;
428 
429 	/*
430 	 * Remove module
431 	 */
432 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
433 		return (status);
434 	}
435 	mutex_destroy(&plugin_state_lock);
436 	return (0);
437 }
438 
439 int
440 _info(struct modinfo *modinfop)
441 {
442 	return (mod_info(&rib_modlinkage, modinfop));
443 }
444 
445 /*
446  * rpcib_getinfo()
447  * Given the device number, return the devinfo pointer or the
448  * instance number.
449  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
450  */
451 
452 /*ARGSUSED*/
453 static int
454 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
455 {
456 	int ret = DDI_SUCCESS;
457 
458 	switch (cmd) {
459 	case DDI_INFO_DEVT2DEVINFO:
460 		if (rpcib.rpcib_dip != NULL)
461 			*result = rpcib.rpcib_dip;
462 		else {
463 			*result = NULL;
464 			ret = DDI_FAILURE;
465 		}
466 		break;
467 
468 	case DDI_INFO_DEVT2INSTANCE:
469 		*result = NULL;
470 		break;
471 
472 	default:
473 		ret = DDI_FAILURE;
474 	}
475 	return (ret);
476 }
477 
478 static int
479 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
480 {
481 	ibt_status_t	ibt_status;
482 	rdma_stat	r_status;
483 
484 	switch (cmd) {
485 	case DDI_ATTACH:
486 		break;
487 	case DDI_RESUME:
488 		return (DDI_SUCCESS);
489 	default:
490 		return (DDI_FAILURE);
491 	}
492 
493 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
494 
495 	mutex_enter(&rpcib.rpcib_mutex);
496 	if (rpcib.rpcib_dip != NULL) {
497 		mutex_exit(&rpcib.rpcib_mutex);
498 		return (DDI_FAILURE);
499 	}
500 	rpcib.rpcib_dip = dip;
501 	mutex_exit(&rpcib.rpcib_mutex);
502 	/*
503 	 * Create the "rpcib" minor-node.
504 	 */
505 	if (ddi_create_minor_node(dip,
506 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
507 		/* Error message, no cmn_err as they print on console */
508 		return (DDI_FAILURE);
509 	}
510 
511 	if (rib_stat == NULL) {
512 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
513 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
514 	}
515 
516 	rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
517 	if (rib_stat->hca_count < 1) {
518 		mutex_destroy(&rib_stat->open_hca_lock);
519 		kmem_free(rib_stat, sizeof (*rib_stat));
520 		rib_stat = NULL;
521 		return (DDI_FAILURE);
522 	}
523 
524 	ibt_status = ibt_attach(&rib_modinfo, dip,
525 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
526 
527 	if (ibt_status != IBT_SUCCESS) {
528 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
529 		mutex_destroy(&rib_stat->open_hca_lock);
530 		kmem_free(rib_stat, sizeof (*rib_stat));
531 		rib_stat = NULL;
532 		return (DDI_FAILURE);
533 	}
534 
535 	mutex_enter(&rib_stat->open_hca_lock);
536 	if (open_hcas(rib_stat) != RDMA_SUCCESS) {
537 		mutex_exit(&rib_stat->open_hca_lock);
538 		goto open_fail;
539 	}
540 	mutex_exit(&rib_stat->open_hca_lock);
541 
542 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
543 	    DDI_PROP_SUCCESS) {
544 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
545 		    "failed.");
546 		goto register_fail;
547 	}
548 
549 	/*
550 	 * Register with rdmatf
551 	 */
552 	rib_mod.rdma_count = rib_stat->nhca_inited;
553 	r_status = rdma_register_mod(&rib_mod);
554 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
555 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
556 		    "status = %d", r_status);
557 		goto register_fail;
558 	}
559 
560 	return (DDI_SUCCESS);
561 
562 register_fail:
563 	rib_detach_hca(rib_stat->hca);
564 open_fail:
565 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
566 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
567 	mutex_destroy(&rib_stat->open_hca_lock);
568 	kmem_free(rib_stat, sizeof (*rib_stat));
569 	rib_stat = NULL;
570 	return (DDI_FAILURE);
571 }
572 
573 /*ARGSUSED*/
574 static int
575 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
576 {
577 	switch (cmd) {
578 
579 	case DDI_DETACH:
580 		break;
581 
582 	case DDI_SUSPEND:
583 	default:
584 		return (DDI_FAILURE);
585 	}
586 
587 	/*
588 	 * Detach the hca and free resources
589 	 */
590 	mutex_enter(&plugin_state_lock);
591 	plugin_state = NO_ACCEPT;
592 	mutex_exit(&plugin_state_lock);
593 	rib_detach_hca(rib_stat->hca);
594 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
595 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
596 	mutex_destroy(&rib_stat->open_hca_lock);
597 	if (rib_stat->hcas) {
598 		kmem_free(rib_stat->hcas, rib_stat->hca_count *
599 		    sizeof (rib_hca_t));
600 		rib_stat->hcas = NULL;
601 	}
602 	kmem_free(rib_stat, sizeof (*rib_stat));
603 	rib_stat = NULL;
604 
605 	mutex_enter(&rpcib.rpcib_mutex);
606 	rpcib.rpcib_dip = NULL;
607 	mutex_exit(&rpcib.rpcib_mutex);
608 	mutex_destroy(&rpcib.rpcib_mutex);
609 	return (DDI_SUCCESS);
610 }
611 
612 
613 static void rib_rbufpool_free(rib_hca_t *, int);
614 static void rib_rbufpool_deregister(rib_hca_t *, int);
615 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
616 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
617 static rdma_stat rib_rem_replylist(rib_qp_t *);
618 static int rib_remreply(rib_qp_t *, struct reply *);
619 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
620 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
621 
622 
623 /*
624  * One CQ pair per HCA
625  */
626 static rdma_stat
627 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
628 	rib_cq_t **cqp, rpcib_state_t *ribstat)
629 {
630 	rib_cq_t	*cq;
631 	ibt_cq_attr_t	cq_attr;
632 	uint32_t	real_size;
633 	ibt_status_t	status;
634 	rdma_stat	error = RDMA_SUCCESS;
635 
636 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
637 	cq->rib_hca = hca;
638 	cq_attr.cq_size = cq_size;
639 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
640 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
641 	    &real_size);
642 	if (status != IBT_SUCCESS) {
643 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
644 		    " status=%d", status);
645 		error = RDMA_FAILED;
646 		goto fail;
647 	}
648 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
649 
650 	/*
651 	 * Enable CQ callbacks. CQ Callbacks are single shot
652 	 * (e.g. you have to call ibt_enable_cq_notify()
653 	 * after each callback to get another one).
654 	 */
655 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
656 	if (status != IBT_SUCCESS) {
657 		cmn_err(CE_WARN, "rib_create_cq: "
658 		    "enable_cq_notify failed, status %d", status);
659 		error = RDMA_FAILED;
660 		goto fail;
661 	}
662 	*cqp = cq;
663 
664 	return (error);
665 fail:
666 	if (cq->rib_cq_hdl)
667 		(void) ibt_free_cq(cq->rib_cq_hdl);
668 	if (cq)
669 		kmem_free(cq, sizeof (rib_cq_t));
670 	return (error);
671 }
672 
673 static rdma_stat
674 open_hcas(rpcib_state_t *ribstat)
675 {
676 	rib_hca_t		*hca;
677 	ibt_status_t		ibt_status;
678 	rdma_stat		status;
679 	ibt_hca_portinfo_t	*pinfop;
680 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
681 	uint_t			size, cq_size;
682 	int			i;
683 	kstat_t *ksp;
684 	cache_avl_struct_t example_avl_node;
685 	char rssc_name[32];
686 
687 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
688 
689 	if (ribstat->hcas == NULL)
690 		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
691 		    sizeof (rib_hca_t), KM_SLEEP);
692 
693 	/*
694 	 * Open a hca and setup for RDMA
695 	 */
696 	for (i = 0; i < ribstat->hca_count; i++) {
697 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
698 		    ribstat->hca_guids[i],
699 		    &ribstat->hcas[i].hca_hdl);
700 		if (ibt_status != IBT_SUCCESS) {
701 			continue;
702 		}
703 		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
704 		hca = &(ribstat->hcas[i]);
705 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
706 		hca->state = HCA_INITED;
707 
708 		/*
709 		 * query HCA info
710 		 */
711 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
712 		if (ibt_status != IBT_SUCCESS) {
713 			goto fail1;
714 		}
715 
716 		/*
717 		 * One PD (Protection Domain) per HCA.
718 		 * A qp is allowed to access a memory region
719 		 * only when it's in the same PD as that of
720 		 * the memory region.
721 		 */
722 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
723 		if (ibt_status != IBT_SUCCESS) {
724 			goto fail1;
725 		}
726 
727 		/*
728 		 * query HCA ports
729 		 */
730 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
731 		    0, &pinfop, &hca->hca_nports, &size);
732 		if (ibt_status != IBT_SUCCESS) {
733 			goto fail2;
734 		}
735 		hca->hca_ports = pinfop;
736 		hca->hca_pinfosz = size;
737 		pinfop = NULL;
738 
739 		cq_size = DEF_CQ_SIZE; /* default cq size */
740 		/*
741 		 * Create 2 pairs of cq's (1 pair for client
742 		 * and the other pair for server) on this hca.
743 		 * If number of qp's gets too large, then several
744 		 * cq's will be needed.
745 		 */
746 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
747 		    &hca->svc_rcq, ribstat);
748 		if (status != RDMA_SUCCESS) {
749 			goto fail3;
750 		}
751 
752 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
753 		    &hca->svc_scq, ribstat);
754 		if (status != RDMA_SUCCESS) {
755 			goto fail3;
756 		}
757 
758 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
759 		    &hca->clnt_rcq, ribstat);
760 		if (status != RDMA_SUCCESS) {
761 			goto fail3;
762 		}
763 
764 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
765 		    &hca->clnt_scq, ribstat);
766 		if (status != RDMA_SUCCESS) {
767 			goto fail3;
768 		}
769 
770 		/*
771 		 * Create buffer pools.
772 		 * Note rib_rbuf_create also allocates memory windows.
773 		 */
774 		hca->recv_pool = rib_rbufpool_create(hca,
775 		    RECV_BUFFER, MAX_BUFS);
776 		if (hca->recv_pool == NULL) {
777 			goto fail3;
778 		}
779 
780 		hca->send_pool = rib_rbufpool_create(hca,
781 		    SEND_BUFFER, MAX_BUFS);
782 		if (hca->send_pool == NULL) {
783 			rib_rbufpool_destroy(hca, RECV_BUFFER);
784 			goto fail3;
785 		}
786 
787 		if (hca->server_side_cache == NULL) {
788 			(void) sprintf(rssc_name,
789 			    "rib_server_side_cache_%04d", i);
790 			hca->server_side_cache = kmem_cache_create(
791 			    rssc_name,
792 			    sizeof (cache_avl_struct_t), 0,
793 			    NULL,
794 			    NULL,
795 			    rib_server_side_cache_reclaim,
796 			    hca, NULL, 0);
797 		}
798 
799 		avl_create(&hca->avl_tree,
800 		    avl_compare,
801 		    sizeof (cache_avl_struct_t),
802 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
803 		    (uint_t)(uintptr_t)&example_avl_node);
804 
805 		rw_init(&hca->avl_rw_lock,
806 		    NULL, RW_DRIVER, hca->iblock);
807 		mutex_init(&hca->cache_allocation,
808 		    NULL, MUTEX_DRIVER, NULL);
809 		hca->avl_init = TRUE;
810 
811 		/* Create kstats for the cache */
812 		ASSERT(INGLOBALZONE(curproc));
813 
814 		if (!stats_enabled) {
815 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
816 			    KSTAT_TYPE_NAMED,
817 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
818 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
819 			    GLOBAL_ZONEID);
820 			if (ksp) {
821 				ksp->ks_data = (void *) &rpcib_kstat;
822 				ksp->ks_update = rpcib_cache_kstat_update;
823 				kstat_install(ksp);
824 				stats_enabled = TRUE;
825 			}
826 		}
827 		if (NULL == hca->reg_cache_clean_up) {
828 			hca->reg_cache_clean_up = ddi_taskq_create(NULL,
829 			    "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0);
830 		}
831 
832 		/*
833 		 * Initialize the registered service list and
834 		 * the lock
835 		 */
836 		hca->service_list = NULL;
837 		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
838 
839 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
840 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
841 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
842 		    hca->iblock);
843 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
844 		    hca->iblock);
845 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
846 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
847 		hca->inuse = TRUE;
848 		/*
849 		 * XXX One hca only. Add multi-hca functionality if needed
850 		 * later.
851 		 */
852 		ribstat->hca = hca;
853 		ribstat->nhca_inited++;
854 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
855 		break;
856 
857 fail3:
858 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
859 fail2:
860 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
861 fail1:
862 		(void) ibt_close_hca(hca->hca_hdl);
863 
864 	}
865 	if (ribstat->hca != NULL)
866 		return (RDMA_SUCCESS);
867 	else
868 		return (RDMA_FAILED);
869 }
870 
871 /*
872  * Callback routines
873  */
874 
875 /*
876  * SCQ handlers
877  */
878 /* ARGSUSED */
879 static void
880 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
881 {
882 	ibt_status_t	ibt_status;
883 	ibt_wc_t	wc;
884 	int		i;
885 
886 	/*
887 	 * Re-enable cq notify here to avoid missing any
888 	 * completion queue notification.
889 	 */
890 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
891 
892 	ibt_status = IBT_SUCCESS;
893 	while (ibt_status != IBT_CQ_EMPTY) {
894 	bzero(&wc, sizeof (wc));
895 	ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
896 	if (ibt_status != IBT_SUCCESS)
897 		return;
898 
899 	/*
900 	 * Got a send completion
901 	 */
902 	if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
903 		struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
904 		CONN	*conn = qptoc(wd->qp);
905 
906 		mutex_enter(&wd->sendwait_lock);
907 		switch (wc.wc_status) {
908 		case IBT_WC_SUCCESS:
909 			wd->status = RDMA_SUCCESS;
910 			break;
911 		case IBT_WC_WR_FLUSHED_ERR:
912 			wd->status = RDMA_FAILED;
913 			break;
914 		default:
915 /*
916  *    RC Send Q Error Code		Local state     Remote State
917  *    ==================== 		===========     ============
918  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
919  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
920  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
921  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
922  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
923  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
924  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
925  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
926  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
927  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
928  *    IBT_WC_WR_FLUSHED_ERR               None            None
929  */
930 			/*
931 			 * Channel in error state. Set connection to
932 			 * ERROR and cleanup will happen either from
933 			 * conn_release  or from rib_conn_get
934 			 */
935 			wd->status = RDMA_FAILED;
936 			mutex_enter(&conn->c_lock);
937 			if (conn->c_state != C_DISCONN_PEND)
938 				conn->c_state = C_ERROR_CONN;
939 			mutex_exit(&conn->c_lock);
940 			break;
941 		}
942 
943 		if (wd->cv_sig == 1) {
944 			/*
945 			 * Notify poster
946 			 */
947 			cv_signal(&wd->wait_cv);
948 			mutex_exit(&wd->sendwait_lock);
949 		} else {
950 			/*
951 			 * Poster not waiting for notification.
952 			 * Free the send buffers and send_wid
953 			 */
954 			for (i = 0; i < wd->nsbufs; i++) {
955 				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
956 				    (void *)(uintptr_t)wd->sbufaddr[i]);
957 				}
958 			mutex_exit(&wd->sendwait_lock);
959 			(void) rib_free_sendwait(wd);
960 			}
961 		}
962 	}
963 }
964 
965 /* ARGSUSED */
966 static void
967 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
968 {
969 	ibt_status_t	ibt_status;
970 	ibt_wc_t	wc;
971 	int		i;
972 
973 	/*
974 	 * Re-enable cq notify here to avoid missing any
975 	 * completion queue notification.
976 	 */
977 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
978 
979 	ibt_status = IBT_SUCCESS;
980 	while (ibt_status != IBT_CQ_EMPTY) {
981 		bzero(&wc, sizeof (wc));
982 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
983 		if (ibt_status != IBT_SUCCESS)
984 			return;
985 
986 		/*
987 		 * Got a send completion
988 		 */
989 		if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
990 			struct send_wid *wd =
991 			    (struct send_wid *)(uintptr_t)wc.wc_id;
992 			mutex_enter(&wd->sendwait_lock);
993 			if (wd->cv_sig == 1) {
994 				/*
995 				 * Update completion status and notify poster
996 				 */
997 				if (wc.wc_status == IBT_WC_SUCCESS)
998 					wd->status = RDMA_SUCCESS;
999 				else
1000 					wd->status = RDMA_FAILED;
1001 				cv_signal(&wd->wait_cv);
1002 				mutex_exit(&wd->sendwait_lock);
1003 			} else {
1004 				/*
1005 				 * Poster not waiting for notification.
1006 				 * Free the send buffers and send_wid
1007 				 */
1008 				for (i = 0; i < wd->nsbufs; i++) {
1009 					rib_rbuf_free(qptoc(wd->qp),
1010 					    SEND_BUFFER,
1011 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1012 				}
1013 				mutex_exit(&wd->sendwait_lock);
1014 				(void) rib_free_sendwait(wd);
1015 			}
1016 		}
1017 	}
1018 }
1019 
1020 /*
1021  * RCQ handler
1022  */
1023 /* ARGSUSED */
1024 static void
1025 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1026 {
1027 	rib_qp_t	*qp;
1028 	ibt_status_t	ibt_status;
1029 	ibt_wc_t	wc;
1030 	struct recv_wid	*rwid;
1031 
1032 	/*
1033 	 * Re-enable cq notify here to avoid missing any
1034 	 * completion queue notification.
1035 	 */
1036 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1037 
1038 	ibt_status = IBT_SUCCESS;
1039 	while (ibt_status != IBT_CQ_EMPTY) {
1040 		bzero(&wc, sizeof (wc));
1041 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1042 		if (ibt_status != IBT_SUCCESS)
1043 			return;
1044 
1045 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1046 		qp = rwid->qp;
1047 		if (wc.wc_status == IBT_WC_SUCCESS) {
1048 			XDR	inxdrs, *xdrs;
1049 			uint_t	xid, vers, op, find_xid = 0;
1050 			struct reply	*r;
1051 			CONN *conn = qptoc(qp);
1052 			uint32_t rdma_credit = 0;
1053 
1054 			xdrs = &inxdrs;
1055 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1056 			    wc.wc_bytes_xfer, XDR_DECODE);
1057 			/*
1058 			 * Treat xid as opaque (xid is the first entity
1059 			 * in the rpc rdma message).
1060 			 */
1061 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1062 
1063 			/* Skip xid and set the xdr position accordingly. */
1064 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1065 			(void) xdr_u_int(xdrs, &vers);
1066 			(void) xdr_u_int(xdrs, &rdma_credit);
1067 			(void) xdr_u_int(xdrs, &op);
1068 			XDR_DESTROY(xdrs);
1069 
1070 			if (vers != RPCRDMA_VERS) {
1071 				/*
1072 				 * Invalid RPC/RDMA version. Cannot
1073 				 * interoperate.  Set connection to
1074 				 * ERROR state and bail out.
1075 				 */
1076 				mutex_enter(&conn->c_lock);
1077 				if (conn->c_state != C_DISCONN_PEND)
1078 					conn->c_state = C_ERROR_CONN;
1079 				mutex_exit(&conn->c_lock);
1080 				rib_rbuf_free(conn, RECV_BUFFER,
1081 				    (void *)(uintptr_t)rwid->addr);
1082 				rib_free_wid(rwid);
1083 				continue;
1084 			}
1085 
1086 			mutex_enter(&qp->replylist_lock);
1087 			for (r = qp->replylist; r != NULL; r = r->next) {
1088 				if (r->xid == xid) {
1089 					find_xid = 1;
1090 					switch (op) {
1091 					case RDMA_MSG:
1092 					case RDMA_NOMSG:
1093 					case RDMA_MSGP:
1094 						r->status = RDMA_SUCCESS;
1095 						r->vaddr_cq = rwid->addr;
1096 						r->bytes_xfer =
1097 						    wc.wc_bytes_xfer;
1098 						cv_signal(&r->wait_cv);
1099 						break;
1100 					default:
1101 						rib_rbuf_free(qptoc(qp),
1102 						    RECV_BUFFER,
1103 						    (void *)(uintptr_t)
1104 						    rwid->addr);
1105 						break;
1106 					}
1107 					break;
1108 				}
1109 			}
1110 			mutex_exit(&qp->replylist_lock);
1111 			if (find_xid == 0) {
1112 				/* RPC caller not waiting for reply */
1113 
1114 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1115 				    int, xid);
1116 
1117 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1118 				    (void *)(uintptr_t)rwid->addr);
1119 			}
1120 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1121 			CONN *conn = qptoc(qp);
1122 
1123 			/*
1124 			 * Connection being flushed. Just free
1125 			 * the posted buffer
1126 			 */
1127 			rib_rbuf_free(conn, RECV_BUFFER,
1128 			    (void *)(uintptr_t)rwid->addr);
1129 		} else {
1130 			CONN *conn = qptoc(qp);
1131 /*
1132  *  RC Recv Q Error Code		Local state     Remote State
1133  *  ====================		===========     ============
1134  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1135  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1136  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1137  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1138  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1139  *  IBT_WC_WR_FLUSHED_ERR               None            None
1140  */
1141 			/*
1142 			 * Channel in error state. Set connection
1143 			 * in ERROR state.
1144 			 */
1145 			mutex_enter(&conn->c_lock);
1146 			if (conn->c_state != C_DISCONN_PEND)
1147 				conn->c_state = C_ERROR_CONN;
1148 			mutex_exit(&conn->c_lock);
1149 			rib_rbuf_free(conn, RECV_BUFFER,
1150 			    (void *)(uintptr_t)rwid->addr);
1151 		}
1152 		rib_free_wid(rwid);
1153 	}
1154 }
1155 
1156 /* Server side */
1157 /* ARGSUSED */
1158 static void
1159 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1160 {
1161 	rdma_recv_data_t *rdp;
1162 	rib_qp_t	*qp;
1163 	ibt_status_t	ibt_status;
1164 	ibt_wc_t	wc;
1165 	struct svc_recv	*s_recvp;
1166 	CONN		*conn;
1167 	mblk_t		*mp;
1168 
1169 	/*
1170 	 * Re-enable cq notify here to avoid missing any
1171 	 * completion queue notification.
1172 	 */
1173 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1174 
1175 	ibt_status = IBT_SUCCESS;
1176 	while (ibt_status != IBT_CQ_EMPTY) {
1177 		bzero(&wc, sizeof (wc));
1178 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1179 		if (ibt_status != IBT_SUCCESS)
1180 			return;
1181 
1182 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1183 		qp = s_recvp->qp;
1184 		conn = qptoc(qp);
1185 		mutex_enter(&qp->posted_rbufs_lock);
1186 		qp->n_posted_rbufs--;
1187 #if defined(MEASURE_POOL_DEPTH)
1188 		rib_posted_rbufs(preposted_rbufs -  qp->n_posted_rbufs);
1189 #endif
1190 		if (qp->n_posted_rbufs == 0)
1191 			cv_signal(&qp->posted_rbufs_cv);
1192 		mutex_exit(&qp->posted_rbufs_lock);
1193 
1194 		if (wc.wc_status == IBT_WC_SUCCESS) {
1195 			XDR	inxdrs, *xdrs;
1196 			uint_t	xid, vers, op;
1197 			uint32_t rdma_credit;
1198 
1199 			xdrs = &inxdrs;
1200 			/* s_recvp->vaddr stores data */
1201 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1202 			    wc.wc_bytes_xfer, XDR_DECODE);
1203 
1204 			/*
1205 			 * Treat xid as opaque (xid is the first entity
1206 			 * in the rpc rdma message).
1207 			 */
1208 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1209 			/* Skip xid and set the xdr position accordingly. */
1210 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1211 			if (!xdr_u_int(xdrs, &vers) ||
1212 			    !xdr_u_int(xdrs, &rdma_credit) ||
1213 			    !xdr_u_int(xdrs, &op)) {
1214 				rib_rbuf_free(conn, RECV_BUFFER,
1215 				    (void *)(uintptr_t)s_recvp->vaddr);
1216 				XDR_DESTROY(xdrs);
1217 				(void) rib_free_svc_recv(s_recvp);
1218 				continue;
1219 			}
1220 			XDR_DESTROY(xdrs);
1221 
1222 			if (vers != RPCRDMA_VERS) {
1223 				/*
1224 				 * Invalid RPC/RDMA version.
1225 				 * Drop rpc rdma message.
1226 				 */
1227 				rib_rbuf_free(conn, RECV_BUFFER,
1228 				    (void *)(uintptr_t)s_recvp->vaddr);
1229 				(void) rib_free_svc_recv(s_recvp);
1230 				continue;
1231 			}
1232 			/*
1233 			 * Is this for RDMA_DONE?
1234 			 */
1235 			if (op == RDMA_DONE) {
1236 				rib_rbuf_free(conn, RECV_BUFFER,
1237 				    (void *)(uintptr_t)s_recvp->vaddr);
1238 				/*
1239 				 * Wake up the thread waiting on
1240 				 * a RDMA_DONE for xid
1241 				 */
1242 				mutex_enter(&qp->rdlist_lock);
1243 				rdma_done_notify(qp, xid);
1244 				mutex_exit(&qp->rdlist_lock);
1245 				(void) rib_free_svc_recv(s_recvp);
1246 				continue;
1247 			}
1248 
1249 			mutex_enter(&plugin_state_lock);
1250 			if (plugin_state == ACCEPT) {
1251 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1252 				    == NULL)
1253 					(void) strwaitbuf(
1254 					    sizeof (*rdp), BPRI_LO);
1255 				/*
1256 				 * Plugin is in accept state, hence the master
1257 				 * transport queue for this is still accepting
1258 				 * requests. Hence we can call svc_queuereq to
1259 				 * queue this recieved msg.
1260 				 */
1261 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1262 				rdp->conn = conn;
1263 				rdp->rpcmsg.addr =
1264 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1265 				rdp->rpcmsg.type = RECV_BUFFER;
1266 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1267 				rdp->status = wc.wc_status;
1268 				mutex_enter(&conn->c_lock);
1269 				conn->c_ref++;
1270 				mutex_exit(&conn->c_lock);
1271 				mp->b_wptr += sizeof (*rdp);
1272 				svc_queuereq((queue_t *)rib_stat->q, mp);
1273 				mutex_exit(&plugin_state_lock);
1274 			} else {
1275 				/*
1276 				 * The master transport for this is going
1277 				 * away and the queue is not accepting anymore
1278 				 * requests for krpc, so don't do anything, just
1279 				 * free the msg.
1280 				 */
1281 				mutex_exit(&plugin_state_lock);
1282 				rib_rbuf_free(conn, RECV_BUFFER,
1283 				    (void *)(uintptr_t)s_recvp->vaddr);
1284 			}
1285 		} else {
1286 			rib_rbuf_free(conn, RECV_BUFFER,
1287 			    (void *)(uintptr_t)s_recvp->vaddr);
1288 		}
1289 		(void) rib_free_svc_recv(s_recvp);
1290 	}
1291 }
1292 
1293 /*
1294  * Handles DR event of IBT_HCA_DETACH_EVENT.
1295  */
1296 /* ARGSUSED */
1297 static void
1298 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1299 	ibt_async_code_t code, ibt_async_event_t *event)
1300 {
1301 
1302 	switch (code) {
1303 	case IBT_HCA_ATTACH_EVENT:
1304 		/* ignore */
1305 		break;
1306 	case IBT_HCA_DETACH_EVENT:
1307 	{
1308 		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1309 		rib_detach_hca(rib_stat->hca);
1310 #ifdef DEBUG
1311 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1312 #endif
1313 		break;
1314 	}
1315 #ifdef DEBUG
1316 	case IBT_EVENT_PATH_MIGRATED:
1317 		cmn_err(CE_NOTE, "rib_async_handler(): "
1318 		    "IBT_EVENT_PATH_MIGRATED\n");
1319 		break;
1320 	case IBT_EVENT_SQD:
1321 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1322 		break;
1323 	case IBT_EVENT_COM_EST:
1324 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1325 		break;
1326 	case IBT_ERROR_CATASTROPHIC_CHAN:
1327 		cmn_err(CE_NOTE, "rib_async_handler(): "
1328 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1329 		break;
1330 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1331 		cmn_err(CE_NOTE, "rib_async_handler(): "
1332 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1333 		break;
1334 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1335 		cmn_err(CE_NOTE, "rib_async_handler(): "
1336 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1337 		break;
1338 	case IBT_ERROR_PATH_MIGRATE_REQ:
1339 		cmn_err(CE_NOTE, "rib_async_handler(): "
1340 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1341 		break;
1342 	case IBT_ERROR_CQ:
1343 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1344 		break;
1345 	case IBT_ERROR_PORT_DOWN:
1346 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1347 		break;
1348 	case IBT_EVENT_PORT_UP:
1349 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1350 		break;
1351 	case IBT_ASYNC_OPAQUE1:
1352 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1353 		break;
1354 	case IBT_ASYNC_OPAQUE2:
1355 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1356 		break;
1357 	case IBT_ASYNC_OPAQUE3:
1358 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1359 		break;
1360 	case IBT_ASYNC_OPAQUE4:
1361 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1362 		break;
1363 #endif
1364 	default:
1365 		break;
1366 	}
1367 }
1368 
1369 /*
1370  * Client's reachable function.
1371  */
1372 static rdma_stat
1373 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1374 {
1375 	rdma_stat	status;
1376 	rpcib_ping_t	rpt;
1377 
1378 	/*
1379 	 * First check if a hca is still attached
1380 	 */
1381 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
1382 	if (rib_stat->hca->state != HCA_INITED) {
1383 		rw_exit(&rib_stat->hca->state_lock);
1384 		return (RDMA_FAILED);
1385 	}
1386 
1387 	bzero(&rpt, sizeof (rpcib_ping_t));
1388 	status = rib_ping_srv(addr_type, raddr, &rpt);
1389 	rw_exit(&rib_stat->hca->state_lock);
1390 
1391 	if (status == RDMA_SUCCESS) {
1392 		*handle = (void *)rpt.hca;
1393 		return (RDMA_SUCCESS);
1394 	} else {
1395 		*handle = NULL;
1396 		DTRACE_PROBE(rpcib__i__pingfailed);
1397 		return (RDMA_FAILED);
1398 	}
1399 }
1400 
1401 /* Client side qp creation */
1402 static rdma_stat
1403 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1404 {
1405 	rib_qp_t	*kqp = NULL;
1406 	CONN		*conn;
1407 	rdma_clnt_cred_ctrl_t *cc_info;
1408 
1409 	ASSERT(qp != NULL);
1410 	*qp = NULL;
1411 
1412 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1413 	conn = qptoc(kqp);
1414 	kqp->hca = hca;
1415 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1416 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1417 
1418 	kqp->mode = RIB_CLIENT;
1419 	kqp->chan_flags = IBT_BLOCKING;
1420 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1421 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1422 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1423 	/*
1424 	 * Initialize
1425 	 */
1426 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1427 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1428 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1429 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1430 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1431 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1432 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1433 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1434 	/*
1435 	 * Initialize the client credit control
1436 	 * portion of the rdmaconn struct.
1437 	 */
1438 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1439 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1440 	cc_info->clnt_cc_granted_ops = 0;
1441 	cc_info->clnt_cc_in_flight_ops = 0;
1442 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1443 
1444 	*qp = kqp;
1445 	return (RDMA_SUCCESS);
1446 }
1447 
1448 /* Server side qp creation */
1449 static rdma_stat
1450 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1451 {
1452 	rib_qp_t	*kqp = NULL;
1453 	ibt_chan_sizes_t	chan_sizes;
1454 	ibt_rc_chan_alloc_args_t	qp_attr;
1455 	ibt_status_t		ibt_status;
1456 	rdma_srv_cred_ctrl_t *cc_info;
1457 
1458 	*qp = NULL;
1459 
1460 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1461 	kqp->hca = hca;
1462 	kqp->port_num = port;
1463 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1464 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1465 
1466 	/*
1467 	 * Create the qp handle
1468 	 */
1469 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1470 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1471 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1472 	qp_attr.rc_pd = hca->pd_hdl;
1473 	qp_attr.rc_hca_port_num = port;
1474 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1475 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1476 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1477 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1478 	qp_attr.rc_clone_chan = NULL;
1479 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1480 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1481 
1482 	rw_enter(&hca->state_lock, RW_READER);
1483 	if (hca->state != HCA_DETACHED) {
1484 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1485 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1486 		    &chan_sizes);
1487 	} else {
1488 		rw_exit(&hca->state_lock);
1489 		goto fail;
1490 	}
1491 	rw_exit(&hca->state_lock);
1492 
1493 	if (ibt_status != IBT_SUCCESS) {
1494 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1495 		    int, ibt_status);
1496 		goto fail;
1497 	}
1498 
1499 	kqp->mode = RIB_SERVER;
1500 	kqp->chan_flags = IBT_BLOCKING;
1501 	kqp->q = q;	/* server ONLY */
1502 
1503 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1504 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1505 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1506 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1507 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1508 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1509 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1510 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1511 	/*
1512 	 * Set the private data area to qp to be used in callbacks
1513 	 */
1514 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1515 	kqp->rdmaconn.c_state = C_CONNECTED;
1516 
1517 	/*
1518 	 * Initialize the server credit control
1519 	 * portion of the rdmaconn struct.
1520 	 */
1521 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1522 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1523 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1524 	cc_info->srv_cc_cur_buffers_used = 0;
1525 	cc_info->srv_cc_posted = preposted_rbufs;
1526 
1527 	*qp = kqp;
1528 
1529 	return (RDMA_SUCCESS);
1530 fail:
1531 	if (kqp)
1532 		kmem_free(kqp, sizeof (rib_qp_t));
1533 
1534 	return (RDMA_FAILED);
1535 }
1536 
1537 /* ARGSUSED */
1538 ibt_cm_status_t
1539 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1540     ibt_cm_return_args_t *ret_args, void *priv_data,
1541     ibt_priv_data_len_t len)
1542 {
1543 	rpcib_state_t   *ribstat;
1544 	rib_hca_t	*hca;
1545 
1546 	ribstat = (rpcib_state_t *)clnt_hdl;
1547 	hca = (rib_hca_t *)ribstat->hca;
1548 
1549 	switch (event->cm_type) {
1550 
1551 	/* got a connection close event */
1552 	case IBT_CM_EVENT_CONN_CLOSED:
1553 	{
1554 		CONN	*conn;
1555 		rib_qp_t *qp;
1556 
1557 		/* check reason why connection was closed */
1558 		switch (event->cm_event.closed) {
1559 		case IBT_CM_CLOSED_DREP_RCVD:
1560 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1561 		case IBT_CM_CLOSED_DUP:
1562 		case IBT_CM_CLOSED_ABORT:
1563 		case IBT_CM_CLOSED_ALREADY:
1564 			/*
1565 			 * These cases indicate the local end initiated
1566 			 * the closing of the channel. Nothing to do here.
1567 			 */
1568 			break;
1569 		default:
1570 			/*
1571 			 * Reason for CONN_CLOSED event must be one of
1572 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1573 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1574 			 * the remote end is closing the channel. In these
1575 			 * cases free the channel and transition to error
1576 			 * state
1577 			 */
1578 			qp = ibt_get_chan_private(event->cm_channel);
1579 			conn = qptoc(qp);
1580 			mutex_enter(&conn->c_lock);
1581 			if (conn->c_state == C_DISCONN_PEND) {
1582 				mutex_exit(&conn->c_lock);
1583 				break;
1584 			}
1585 
1586 			conn->c_state = C_ERROR_CONN;
1587 
1588 			/*
1589 			 * Free the rc_channel. Channel has already
1590 			 * transitioned to ERROR state and WRs have been
1591 			 * FLUSHED_ERR already.
1592 			 */
1593 			(void) ibt_free_channel(qp->qp_hdl);
1594 			qp->qp_hdl = NULL;
1595 
1596 			/*
1597 			 * Free the conn if c_ref is down to 0 already
1598 			 */
1599 			if (conn->c_ref == 0) {
1600 				/*
1601 				 * Remove from list and free conn
1602 				 */
1603 				conn->c_state = C_DISCONN_PEND;
1604 				mutex_exit(&conn->c_lock);
1605 				(void) rib_disconnect_channel(conn,
1606 				    &hca->cl_conn_list);
1607 			} else {
1608 				mutex_exit(&conn->c_lock);
1609 			}
1610 #ifdef DEBUG
1611 			if (rib_debug)
1612 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1613 				    "(CONN_CLOSED) channel disconnected");
1614 #endif
1615 			break;
1616 		}
1617 		break;
1618 	}
1619 	default:
1620 		break;
1621 	}
1622 	return (IBT_CM_ACCEPT);
1623 }
1624 
1625 /*
1626  * Connect to the server.
1627  */
1628 rdma_stat
1629 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1630 {
1631 	ibt_chan_open_args_t	chan_args;	/* channel args */
1632 	ibt_chan_sizes_t	chan_sizes;
1633 	ibt_rc_chan_alloc_args_t	qp_attr;
1634 	ibt_status_t		ibt_status;
1635 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1636 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1637 	ibt_ip_cm_info_t	ipcm_info;
1638 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1639 
1640 
1641 	(void) bzero(&chan_args, sizeof (chan_args));
1642 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1643 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1644 
1645 	ipcm_info.src_addr.family = rptp->srcip.family;
1646 	switch (ipcm_info.src_addr.family) {
1647 	case AF_INET:
1648 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1649 		break;
1650 	case AF_INET6:
1651 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1652 		break;
1653 	}
1654 
1655 	ipcm_info.dst_addr.family = rptp->srcip.family;
1656 	switch (ipcm_info.dst_addr.family) {
1657 	case AF_INET:
1658 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1659 		break;
1660 	case AF_INET6:
1661 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1662 		break;
1663 	}
1664 
1665 	ipcm_info.src_port = NFS_RDMA_PORT;
1666 
1667 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1668 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1669 
1670 	if (ibt_status != IBT_SUCCESS) {
1671 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1672 		return (-1);
1673 	}
1674 
1675 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1676 	/* Alloc a RC channel */
1677 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1678 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1679 	qp_attr.rc_pd = hca->pd_hdl;
1680 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1681 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1682 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1683 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1684 	qp_attr.rc_clone_chan = NULL;
1685 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1686 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1687 
1688 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT);
1689 	chan_args.oc_path = &rptp->path;
1690 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1691 	chan_args.oc_cm_clnt_private = (void *)rib_stat;
1692 	chan_args.oc_rdma_ra_out = 4;
1693 	chan_args.oc_rdma_ra_in = 4;
1694 	chan_args.oc_path_retry_cnt = 2;
1695 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1696 	chan_args.oc_priv_data = cmp_ip_pvt;
1697 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1698 
1699 refresh:
1700 	rw_enter(&hca->state_lock, RW_READER);
1701 	if (hca->state != HCA_DETACHED) {
1702 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1703 		    IBT_ACHAN_NO_FLAGS,
1704 		    &qp_attr, &qp->qp_hdl,
1705 		    &chan_sizes);
1706 	} else {
1707 		rw_exit(&hca->state_lock);
1708 		return (RDMA_FAILED);
1709 	}
1710 	rw_exit(&hca->state_lock);
1711 
1712 	if (ibt_status != IBT_SUCCESS) {
1713 		DTRACE_PROBE1(rpcib__i_conntosrv,
1714 		    int, ibt_status);
1715 		return (RDMA_FAILED);
1716 	}
1717 
1718 	/* Connect to the Server */
1719 	(void) bzero(&ret_args, sizeof (ret_args));
1720 	mutex_enter(&qp->cb_lock);
1721 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1722 	    IBT_BLOCKING, &chan_args, &ret_args);
1723 	if (ibt_status != IBT_SUCCESS) {
1724 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1725 		    int, ibt_status, int, ret_args.rc_status);
1726 
1727 		(void) ibt_free_channel(qp->qp_hdl);
1728 		qp->qp_hdl = NULL;
1729 		mutex_exit(&qp->cb_lock);
1730 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1731 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1732 			/*
1733 			 * Got IBT_CM_CONN_STALE probably because of stale
1734 			 * data on the passive end of a channel that existed
1735 			 * prior to reboot. Retry establishing a channel
1736 			 * REFRESH_ATTEMPTS times, during which time the
1737 			 * stale conditions on the server might clear up.
1738 			 */
1739 			goto refresh;
1740 		}
1741 		return (RDMA_FAILED);
1742 	}
1743 	mutex_exit(&qp->cb_lock);
1744 	/*
1745 	 * Set the private data area to qp to be used in callbacks
1746 	 */
1747 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1748 	return (RDMA_SUCCESS);
1749 }
1750 
1751 rdma_stat
1752 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1753 {
1754 	uint_t			i;
1755 	ibt_status_t		ibt_status;
1756 	uint8_t			num_paths_p;
1757 	ibt_ip_path_attr_t	ipattr;
1758 	ibt_path_ip_src_t	srcip;
1759 	rpcib_ipaddrs_t		addrs4;
1760 	rpcib_ipaddrs_t		addrs6;
1761 	struct sockaddr_in	*sinp;
1762 	struct sockaddr_in6	*sin6p;
1763 	rdma_stat		retval = RDMA_SUCCESS;
1764 
1765 	ASSERT(raddr->buf != NULL);
1766 
1767 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1768 
1769 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1770 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1771 		retval = RDMA_FAILED;
1772 		goto done;
1773 	}
1774 
1775 	/* Prep the destination address */
1776 	switch (addr_type) {
1777 	case AF_INET:
1778 		sinp = (struct sockaddr_in *)raddr->buf;
1779 		rptp->dstip.family = AF_INET;
1780 		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1781 		sinp = addrs4.ri_list;
1782 
1783 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1784 		ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1785 		ipattr.ipa_ndst		= 1;
1786 		ipattr.ipa_max_paths	= 1;
1787 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1788 		for (i = 0; i < addrs4.ri_count; i++) {
1789 			num_paths_p = 0;
1790 			ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr;
1791 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1792 
1793 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1794 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1795 			    &num_paths_p, &srcip);
1796 			if (ibt_status == IBT_SUCCESS &&
1797 			    num_paths_p != 0 &&
1798 			    rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) {
1799 				rptp->hca = rib_stat->hca;
1800 				rptp->srcip.family = AF_INET;
1801 				rptp->srcip.un.ip4addr =
1802 				    srcip.ip_primary.un.ip4addr;
1803 				goto done;
1804 			}
1805 		}
1806 		retval = RDMA_FAILED;
1807 		break;
1808 
1809 	case AF_INET6:
1810 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1811 		rptp->dstip.family = AF_INET6;
1812 		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1813 		sin6p = addrs6.ri_list;
1814 
1815 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1816 		ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1817 		ipattr.ipa_ndst		= 1;
1818 		ipattr.ipa_max_paths	= 1;
1819 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1820 		for (i = 0; i < addrs6.ri_count; i++) {
1821 			num_paths_p = 0;
1822 			ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr;
1823 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1824 
1825 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1826 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1827 			    &num_paths_p, &srcip);
1828 			if (ibt_status == IBT_SUCCESS &&
1829 			    num_paths_p != 0 &&
1830 			    rptp->path.pi_hca_guid == rib_stat->hca->hca_guid) {
1831 				rptp->hca = rib_stat->hca;
1832 				rptp->srcip.family = AF_INET6;
1833 				rptp->srcip.un.ip6addr =
1834 				    srcip.ip_primary.un.ip6addr;
1835 				goto done;
1836 			}
1837 		}
1838 		retval = RDMA_FAILED;
1839 		break;
1840 
1841 	default:
1842 		retval = RDMA_INVAL;
1843 		break;
1844 	}
1845 done:
1846 
1847 	if (addrs4.ri_size > 0)
1848 		kmem_free(addrs4.ri_list, addrs4.ri_size);
1849 	if (addrs6.ri_size > 0)
1850 		kmem_free(addrs6.ri_list, addrs6.ri_size);
1851 	return (retval);
1852 }
1853 
1854 /*
1855  * Close channel, remove from connection list and
1856  * free up resources allocated for that channel.
1857  */
1858 rdma_stat
1859 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
1860 {
1861 	rib_qp_t	*qp = ctoqp(conn);
1862 	rib_hca_t	*hca;
1863 
1864 	/*
1865 	 * c_ref == 0 and connection is in C_DISCONN_PEND
1866 	 */
1867 	hca = qp->hca;
1868 	if (conn_list != NULL)
1869 		(void) rib_rm_conn(conn, conn_list);
1870 
1871 	if (qp->qp_hdl != NULL) {
1872 		/*
1873 		 * If the channel has not been establised,
1874 		 * ibt_flush_channel is called to flush outstanding WRs
1875 		 * on the Qs.  Otherwise, ibt_close_rc_channel() is
1876 		 * called.  The channel is then freed.
1877 		 */
1878 		if (conn_list != NULL)
1879 			(void) ibt_close_rc_channel(qp->qp_hdl,
1880 			    IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
1881 		else
1882 			(void) ibt_flush_channel(qp->qp_hdl);
1883 
1884 		mutex_enter(&qp->posted_rbufs_lock);
1885 		while (qp->n_posted_rbufs)
1886 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
1887 		mutex_exit(&qp->posted_rbufs_lock);
1888 		(void) ibt_free_channel(qp->qp_hdl);
1889 		qp->qp_hdl = NULL;
1890 	}
1891 
1892 	ASSERT(qp->rdlist == NULL);
1893 
1894 	if (qp->replylist != NULL) {
1895 		(void) rib_rem_replylist(qp);
1896 	}
1897 
1898 	cv_destroy(&qp->cb_conn_cv);
1899 	cv_destroy(&qp->posted_rbufs_cv);
1900 	mutex_destroy(&qp->cb_lock);
1901 
1902 	mutex_destroy(&qp->replylist_lock);
1903 	mutex_destroy(&qp->posted_rbufs_lock);
1904 	mutex_destroy(&qp->rdlist_lock);
1905 
1906 	cv_destroy(&conn->c_cv);
1907 	mutex_destroy(&conn->c_lock);
1908 
1909 	if (conn->c_raddr.buf != NULL) {
1910 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
1911 	}
1912 	if (conn->c_laddr.buf != NULL) {
1913 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
1914 	}
1915 
1916 	/*
1917 	 * Credit control cleanup.
1918 	 */
1919 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
1920 		rdma_clnt_cred_ctrl_t *cc_info;
1921 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1922 		cv_destroy(&cc_info->clnt_cc_cv);
1923 	}
1924 
1925 	kmem_free(qp, sizeof (rib_qp_t));
1926 
1927 	/*
1928 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
1929 	 * then the hca is no longer being used.
1930 	 */
1931 	if (conn_list != NULL) {
1932 		rw_enter(&hca->state_lock, RW_READER);
1933 		if (hca->state == HCA_DETACHED) {
1934 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
1935 			if (hca->srv_conn_list.conn_hd == NULL) {
1936 				rw_enter(&hca->cl_conn_list.conn_lock,
1937 				    RW_READER);
1938 
1939 				if (hca->cl_conn_list.conn_hd == NULL) {
1940 					mutex_enter(&hca->inuse_lock);
1941 					hca->inuse = FALSE;
1942 					cv_signal(&hca->cb_cv);
1943 					mutex_exit(&hca->inuse_lock);
1944 				}
1945 				rw_exit(&hca->cl_conn_list.conn_lock);
1946 			}
1947 			rw_exit(&hca->srv_conn_list.conn_lock);
1948 		}
1949 		rw_exit(&hca->state_lock);
1950 	}
1951 
1952 	return (RDMA_SUCCESS);
1953 }
1954 
1955 /*
1956  * Wait for send completion notification. Only on receiving a
1957  * notification be it a successful or error completion, free the
1958  * send_wid.
1959  */
1960 static rdma_stat
1961 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
1962 {
1963 	clock_t timout, cv_wait_ret;
1964 	rdma_stat error = RDMA_SUCCESS;
1965 	int	i;
1966 
1967 	/*
1968 	 * Wait for send to complete
1969 	 */
1970 	ASSERT(wd != NULL);
1971 	mutex_enter(&wd->sendwait_lock);
1972 	if (wd->status == (uint_t)SEND_WAIT) {
1973 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
1974 		    ddi_get_lbolt();
1975 
1976 		if (qp->mode == RIB_SERVER) {
1977 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
1978 			    &wd->sendwait_lock, timout)) > 0 &&
1979 			    wd->status == (uint_t)SEND_WAIT)
1980 				;
1981 			switch (cv_wait_ret) {
1982 			case -1:	/* timeout */
1983 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
1984 
1985 				wd->cv_sig = 0;		/* no signal needed */
1986 				error = RDMA_TIMEDOUT;
1987 				break;
1988 			default:	/* got send completion */
1989 				break;
1990 			}
1991 		} else {
1992 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
1993 			    &wd->sendwait_lock, timout)) > 0 &&
1994 			    wd->status == (uint_t)SEND_WAIT)
1995 				;
1996 			switch (cv_wait_ret) {
1997 			case -1:	/* timeout */
1998 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
1999 
2000 				wd->cv_sig = 0;		/* no signal needed */
2001 				error = RDMA_TIMEDOUT;
2002 				break;
2003 			case 0:		/* interrupted */
2004 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2005 
2006 				wd->cv_sig = 0;		/* no signal needed */
2007 				error = RDMA_INTR;
2008 				break;
2009 			default:	/* got send completion */
2010 				break;
2011 			}
2012 		}
2013 	}
2014 
2015 	if (wd->status != (uint_t)SEND_WAIT) {
2016 		/* got send completion */
2017 		if (wd->status != RDMA_SUCCESS) {
2018 			error = wd->status;
2019 		if (wd->status != RDMA_CONNLOST)
2020 			error = RDMA_FAILED;
2021 		}
2022 		for (i = 0; i < wd->nsbufs; i++) {
2023 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2024 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2025 		}
2026 		mutex_exit(&wd->sendwait_lock);
2027 		(void) rib_free_sendwait(wd);
2028 	} else {
2029 		mutex_exit(&wd->sendwait_lock);
2030 	}
2031 	return (error);
2032 }
2033 
2034 static struct send_wid *
2035 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2036 {
2037 	struct send_wid	*wd;
2038 
2039 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2040 	wd->xid = xid;
2041 	wd->cv_sig = cv_sig;
2042 	wd->qp = qp;
2043 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2044 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2045 	wd->status = (uint_t)SEND_WAIT;
2046 
2047 	return (wd);
2048 }
2049 
2050 static int
2051 rib_free_sendwait(struct send_wid *wdesc)
2052 {
2053 	cv_destroy(&wdesc->wait_cv);
2054 	mutex_destroy(&wdesc->sendwait_lock);
2055 	kmem_free(wdesc, sizeof (*wdesc));
2056 
2057 	return (0);
2058 }
2059 
2060 static rdma_stat
2061 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2062 {
2063 	mutex_enter(&qp->replylist_lock);
2064 	if (rep != NULL) {
2065 		(void) rib_remreply(qp, rep);
2066 		mutex_exit(&qp->replylist_lock);
2067 		return (RDMA_SUCCESS);
2068 	}
2069 	mutex_exit(&qp->replylist_lock);
2070 	return (RDMA_FAILED);
2071 }
2072 
2073 /*
2074  * Send buffers are freed here only in case of error in posting
2075  * on QP. If the post succeeded, the send buffers are freed upon
2076  * send completion in rib_sendwait() or in the scq_handler.
2077  */
2078 rdma_stat
2079 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2080 	int send_sig, int cv_sig, caddr_t *swid)
2081 {
2082 	struct send_wid	*wdesc;
2083 	struct clist	*clp;
2084 	ibt_status_t	ibt_status = IBT_SUCCESS;
2085 	rdma_stat	ret = RDMA_SUCCESS;
2086 	ibt_send_wr_t	tx_wr;
2087 	int		i, nds;
2088 	ibt_wr_ds_t	sgl[DSEG_MAX];
2089 	uint_t		total_msg_size;
2090 	rib_qp_t	*qp;
2091 
2092 	qp = ctoqp(conn);
2093 
2094 	ASSERT(cl != NULL);
2095 
2096 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2097 
2098 	nds = 0;
2099 	total_msg_size = 0;
2100 	clp = cl;
2101 	while (clp != NULL) {
2102 		if (nds >= DSEG_MAX) {
2103 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2104 			return (RDMA_FAILED);
2105 		}
2106 		sgl[nds].ds_va = clp->w.c_saddr;
2107 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2108 		sgl[nds].ds_len = clp->c_len;
2109 		total_msg_size += clp->c_len;
2110 		clp = clp->c_next;
2111 		nds++;
2112 	}
2113 
2114 	if (send_sig) {
2115 		/* Set SEND_SIGNAL flag. */
2116 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2117 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2118 		*swid = (caddr_t)wdesc;
2119 	} else {
2120 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2121 		wdesc = rib_init_sendwait(msgid, 0, qp);
2122 		*swid = (caddr_t)wdesc;
2123 	}
2124 	wdesc->nsbufs = nds;
2125 	for (i = 0; i < nds; i++) {
2126 		wdesc->sbufaddr[i] = sgl[i].ds_va;
2127 	}
2128 
2129 	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2130 	tx_wr.wr_opcode = IBT_WRC_SEND;
2131 	tx_wr.wr_trans = IBT_RC_SRV;
2132 	tx_wr.wr_nds = nds;
2133 	tx_wr.wr_sgl = sgl;
2134 
2135 	mutex_enter(&conn->c_lock);
2136 	if (conn->c_state == C_CONNECTED) {
2137 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2138 	}
2139 	if (conn->c_state != C_CONNECTED ||
2140 	    ibt_status != IBT_SUCCESS) {
2141 		if (conn->c_state != C_DISCONN_PEND)
2142 			conn->c_state = C_ERROR_CONN;
2143 		mutex_exit(&conn->c_lock);
2144 		for (i = 0; i < nds; i++) {
2145 			rib_rbuf_free(conn, SEND_BUFFER,
2146 			    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2147 		}
2148 
2149 		(void) rib_free_sendwait(wdesc);
2150 
2151 		return (RDMA_CONNLOST);
2152 	}
2153 	mutex_exit(&conn->c_lock);
2154 
2155 	if (send_sig) {
2156 		if (cv_sig) {
2157 			/*
2158 			 * cv_wait for send to complete.
2159 			 * We can fail due to a timeout or signal or
2160 			 * unsuccessful send.
2161 			 */
2162 			ret = rib_sendwait(qp, wdesc);
2163 
2164 			return (ret);
2165 		}
2166 	}
2167 
2168 	return (RDMA_SUCCESS);
2169 }
2170 
2171 
2172 rdma_stat
2173 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2174 {
2175 	rdma_stat	ret;
2176 	caddr_t		wd;
2177 
2178 	/* send-wait & cv_signal */
2179 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2180 	return (ret);
2181 }
2182 
2183 /*
2184  * Server interface (svc_rdma_ksend).
2185  * Send RPC reply and wait for RDMA_DONE.
2186  */
2187 rdma_stat
2188 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2189 {
2190 	rdma_stat ret = RDMA_SUCCESS;
2191 	struct rdma_done_list *rd;
2192 	clock_t timout, cv_wait_ret;
2193 	caddr_t *wid = NULL;
2194 	rib_qp_t *qp = ctoqp(conn);
2195 
2196 	mutex_enter(&qp->rdlist_lock);
2197 	rd = rdma_done_add(qp, msgid);
2198 
2199 	/* No cv_signal (whether send-wait or no-send-wait) */
2200 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2201 
2202 	if (ret != RDMA_SUCCESS) {
2203 		rdma_done_rm(qp, rd);
2204 	} else {
2205 		/*
2206 		 * Wait for RDMA_DONE from remote end
2207 		 */
2208 		timout =
2209 		    drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2210 		cv_wait_ret = cv_timedwait(&rd->rdma_done_cv,
2211 		    &qp->rdlist_lock,
2212 		    timout);
2213 
2214 		rdma_done_rm(qp, rd);
2215 
2216 		if (cv_wait_ret < 0) {
2217 			ret = RDMA_TIMEDOUT;
2218 		}
2219 	}
2220 
2221 	mutex_exit(&qp->rdlist_lock);
2222 	return (ret);
2223 }
2224 
2225 static struct recv_wid *
2226 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2227 {
2228 	struct recv_wid	*rwid;
2229 
2230 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2231 	rwid->xid = msgid;
2232 	rwid->addr = sgl->ds_va;
2233 	rwid->qp = qp;
2234 
2235 	return (rwid);
2236 }
2237 
2238 static void
2239 rib_free_wid(struct recv_wid *rwid)
2240 {
2241 	kmem_free(rwid, sizeof (struct recv_wid));
2242 }
2243 
2244 rdma_stat
2245 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2246 {
2247 	rib_qp_t	*qp = ctoqp(conn);
2248 	struct clist	*clp = cl;
2249 	struct reply	*rep;
2250 	struct recv_wid	*rwid;
2251 	int		nds;
2252 	ibt_wr_ds_t	sgl[DSEG_MAX];
2253 	ibt_recv_wr_t	recv_wr;
2254 	rdma_stat	ret;
2255 	ibt_status_t	ibt_status;
2256 
2257 	/*
2258 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2259 	 */
2260 
2261 	nds = 0;
2262 	while (cl != NULL) {
2263 		if (nds >= DSEG_MAX) {
2264 			ret = RDMA_FAILED;
2265 			goto done;
2266 		}
2267 		sgl[nds].ds_va = cl->w.c_saddr;
2268 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2269 		sgl[nds].ds_len = cl->c_len;
2270 		cl = cl->c_next;
2271 		nds++;
2272 	}
2273 
2274 	if (nds != 1) {
2275 		ret = RDMA_FAILED;
2276 		goto done;
2277 	}
2278 
2279 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2280 	recv_wr.wr_nds = nds;
2281 	recv_wr.wr_sgl = sgl;
2282 
2283 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2284 	if (rwid) {
2285 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2286 	} else {
2287 		ret = RDMA_NORESOURCE;
2288 		goto done;
2289 	}
2290 	rep = rib_addreplylist(qp, msgid);
2291 	if (!rep) {
2292 		rib_free_wid(rwid);
2293 		ret = RDMA_NORESOURCE;
2294 		goto done;
2295 	}
2296 
2297 	mutex_enter(&conn->c_lock);
2298 
2299 	if (conn->c_state == C_CONNECTED) {
2300 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2301 	}
2302 
2303 	if (conn->c_state != C_CONNECTED ||
2304 	    ibt_status != IBT_SUCCESS) {
2305 		if (conn->c_state != C_DISCONN_PEND)
2306 			conn->c_state = C_ERROR_CONN;
2307 		mutex_exit(&conn->c_lock);
2308 		rib_free_wid(rwid);
2309 		(void) rib_rem_rep(qp, rep);
2310 		ret = RDMA_CONNLOST;
2311 		goto done;
2312 	}
2313 	mutex_exit(&conn->c_lock);
2314 	return (RDMA_SUCCESS);
2315 
2316 done:
2317 	while (clp != NULL) {
2318 		rib_rbuf_free(conn, RECV_BUFFER,
2319 		    (void *)(uintptr_t)clp->w.c_saddr3);
2320 		clp = clp->c_next;
2321 	}
2322 	return (ret);
2323 }
2324 
2325 rdma_stat
2326 rib_svc_post(CONN* conn, struct clist *cl)
2327 {
2328 	rib_qp_t	*qp = ctoqp(conn);
2329 	struct svc_recv	*s_recvp;
2330 	int		nds;
2331 	ibt_wr_ds_t	sgl[DSEG_MAX];
2332 	ibt_recv_wr_t	recv_wr;
2333 	ibt_status_t	ibt_status;
2334 
2335 	nds = 0;
2336 	while (cl != NULL) {
2337 		if (nds >= DSEG_MAX) {
2338 			return (RDMA_FAILED);
2339 		}
2340 		sgl[nds].ds_va = cl->w.c_saddr;
2341 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2342 		sgl[nds].ds_len = cl->c_len;
2343 		cl = cl->c_next;
2344 		nds++;
2345 	}
2346 
2347 	if (nds != 1) {
2348 		rib_rbuf_free(conn, RECV_BUFFER,
2349 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2350 
2351 		return (RDMA_FAILED);
2352 	}
2353 
2354 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2355 	recv_wr.wr_nds = nds;
2356 	recv_wr.wr_sgl = sgl;
2357 
2358 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2359 	/* Use s_recvp's addr as wr id */
2360 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2361 	mutex_enter(&conn->c_lock);
2362 	if (conn->c_state == C_CONNECTED) {
2363 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2364 	}
2365 	if (conn->c_state != C_CONNECTED ||
2366 	    ibt_status != IBT_SUCCESS) {
2367 		if (conn->c_state != C_DISCONN_PEND)
2368 			conn->c_state = C_ERROR_CONN;
2369 		mutex_exit(&conn->c_lock);
2370 		rib_rbuf_free(conn, RECV_BUFFER,
2371 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2372 		(void) rib_free_svc_recv(s_recvp);
2373 
2374 		return (RDMA_CONNLOST);
2375 	}
2376 	mutex_exit(&conn->c_lock);
2377 
2378 	return (RDMA_SUCCESS);
2379 }
2380 
2381 /* Client */
2382 rdma_stat
2383 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2384 {
2385 
2386 	return (rib_clnt_post(conn, cl, msgid));
2387 }
2388 
2389 /* Client */
2390 rdma_stat
2391 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2392 {
2393 	rib_qp_t	*qp = ctoqp(conn);
2394 	struct reply	*rep;
2395 
2396 	mutex_enter(&qp->replylist_lock);
2397 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2398 		if (rep->xid == msgid) {
2399 			if (rep->vaddr_cq) {
2400 				rib_rbuf_free(conn, RECV_BUFFER,
2401 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2402 			}
2403 			(void) rib_remreply(qp, rep);
2404 			break;
2405 		}
2406 	}
2407 	mutex_exit(&qp->replylist_lock);
2408 
2409 	return (RDMA_SUCCESS);
2410 }
2411 
2412 /* Server */
2413 rdma_stat
2414 rib_post_recv(CONN *conn, struct clist *cl)
2415 {
2416 	rib_qp_t	*qp = ctoqp(conn);
2417 
2418 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2419 		mutex_enter(&qp->posted_rbufs_lock);
2420 		qp->n_posted_rbufs++;
2421 		mutex_exit(&qp->posted_rbufs_lock);
2422 		return (RDMA_SUCCESS);
2423 	}
2424 	return (RDMA_FAILED);
2425 }
2426 
2427 /*
2428  * Client side only interface to "recv" the rpc reply buf
2429  * posted earlier by rib_post_resp(conn, cl, msgid).
2430  */
2431 rdma_stat
2432 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2433 {
2434 	struct reply *rep = NULL;
2435 	clock_t timout, cv_wait_ret;
2436 	rdma_stat ret = RDMA_SUCCESS;
2437 	rib_qp_t *qp = ctoqp(conn);
2438 
2439 	/*
2440 	 * Find the reply structure for this msgid
2441 	 */
2442 	mutex_enter(&qp->replylist_lock);
2443 
2444 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2445 		if (rep->xid == msgid)
2446 			break;
2447 	}
2448 
2449 	if (rep != NULL) {
2450 		/*
2451 		 * If message not yet received, wait.
2452 		 */
2453 		if (rep->status == (uint_t)REPLY_WAIT) {
2454 			timout = ddi_get_lbolt() +
2455 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2456 
2457 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2458 			    &qp->replylist_lock, timout)) > 0 &&
2459 			    rep->status == (uint_t)REPLY_WAIT)
2460 				;
2461 
2462 			switch (cv_wait_ret) {
2463 			case -1:	/* timeout */
2464 				ret = RDMA_TIMEDOUT;
2465 				break;
2466 			case 0:
2467 				ret = RDMA_INTR;
2468 				break;
2469 			default:
2470 				break;
2471 			}
2472 		}
2473 
2474 		if (rep->status == RDMA_SUCCESS) {
2475 			struct clist *cl = NULL;
2476 
2477 			/*
2478 			 * Got message successfully
2479 			 */
2480 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2481 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2482 			*clp = cl;
2483 		} else {
2484 			if (rep->status != (uint_t)REPLY_WAIT) {
2485 				/*
2486 				 * Got error in reply message. Free
2487 				 * recv buffer here.
2488 				 */
2489 				ret = rep->status;
2490 				rib_rbuf_free(conn, RECV_BUFFER,
2491 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2492 			}
2493 		}
2494 		(void) rib_remreply(qp, rep);
2495 	} else {
2496 		/*
2497 		 * No matching reply structure found for given msgid on the
2498 		 * reply wait list.
2499 		 */
2500 		ret = RDMA_INVAL;
2501 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2502 	}
2503 
2504 	/*
2505 	 * Done.
2506 	 */
2507 	mutex_exit(&qp->replylist_lock);
2508 	return (ret);
2509 }
2510 
2511 /*
2512  * RDMA write a buffer to the remote address.
2513  */
2514 rdma_stat
2515 rib_write(CONN *conn, struct clist *cl, int wait)
2516 {
2517 	ibt_send_wr_t	tx_wr;
2518 	int		cv_sig;
2519 	int		i;
2520 	ibt_wr_ds_t	sgl[DSEG_MAX];
2521 	struct send_wid	*wdesc;
2522 	ibt_status_t	ibt_status;
2523 	rdma_stat	ret = RDMA_SUCCESS;
2524 	rib_qp_t	*qp = ctoqp(conn);
2525 	uint64_t	n_writes = 0;
2526 	bool_t		force_wait = FALSE;
2527 
2528 	if (cl == NULL) {
2529 		return (RDMA_FAILED);
2530 	}
2531 
2532 
2533 	while ((cl != NULL)) {
2534 		if (cl->c_len > 0) {
2535 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2536 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2537 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2538 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2539 			sgl[0].ds_va = cl->w.c_saddr;
2540 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2541 			sgl[0].ds_len = cl->c_len;
2542 
2543 			if (wait) {
2544 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2545 				cv_sig = 1;
2546 			} else {
2547 				if (n_writes > max_unsignaled_rws) {
2548 					n_writes = 0;
2549 					force_wait = TRUE;
2550 					tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2551 					cv_sig = 1;
2552 				} else {
2553 					tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2554 					cv_sig = 0;
2555 				}
2556 			}
2557 
2558 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2559 			tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2560 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2561 			tx_wr.wr_trans = IBT_RC_SRV;
2562 			tx_wr.wr_nds = 1;
2563 			tx_wr.wr_sgl = sgl;
2564 
2565 			mutex_enter(&conn->c_lock);
2566 			if (conn->c_state == C_CONNECTED) {
2567 				ibt_status =
2568 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2569 			}
2570 			if (conn->c_state != C_CONNECTED ||
2571 			    ibt_status != IBT_SUCCESS) {
2572 				if (conn->c_state != C_DISCONN_PEND)
2573 					conn->c_state = C_ERROR_CONN;
2574 				mutex_exit(&conn->c_lock);
2575 				(void) rib_free_sendwait(wdesc);
2576 				return (RDMA_CONNLOST);
2577 			}
2578 			mutex_exit(&conn->c_lock);
2579 
2580 			/*
2581 			 * Wait for send to complete
2582 			 */
2583 			if (wait || force_wait) {
2584 				force_wait = FALSE;
2585 				ret = rib_sendwait(qp, wdesc);
2586 				if (ret != 0) {
2587 					return (ret);
2588 				}
2589 			} else {
2590 				mutex_enter(&wdesc->sendwait_lock);
2591 				for (i = 0; i < wdesc->nsbufs; i++) {
2592 					rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2593 					    (void *)(uintptr_t)
2594 					    wdesc->sbufaddr[i]);
2595 				}
2596 				mutex_exit(&wdesc->sendwait_lock);
2597 				(void) rib_free_sendwait(wdesc);
2598 			}
2599 			n_writes ++;
2600 		}
2601 		cl = cl->c_next;
2602 	}
2603 	return (RDMA_SUCCESS);
2604 }
2605 
2606 /*
2607  * RDMA Read a buffer from the remote address.
2608  */
2609 rdma_stat
2610 rib_read(CONN *conn, struct clist *cl, int wait)
2611 {
2612 	ibt_send_wr_t	rx_wr;
2613 	int		cv_sig;
2614 	int		i;
2615 	ibt_wr_ds_t	sgl;
2616 	struct send_wid	*wdesc;
2617 	ibt_status_t	ibt_status = IBT_SUCCESS;
2618 	rdma_stat	ret = RDMA_SUCCESS;
2619 	rib_qp_t	*qp = ctoqp(conn);
2620 
2621 	if (cl == NULL) {
2622 		return (RDMA_FAILED);
2623 	}
2624 
2625 	while (cl != NULL) {
2626 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2627 		/*
2628 		 * Remote address is at the head chunk item in list.
2629 		 */
2630 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2631 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2632 
2633 		sgl.ds_va = cl->u.c_daddr;
2634 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2635 		sgl.ds_len = cl->c_len;
2636 
2637 		if (wait) {
2638 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2639 			cv_sig = 1;
2640 		} else {
2641 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2642 			cv_sig = 0;
2643 		}
2644 
2645 		wdesc = rib_init_sendwait(0, cv_sig, qp);
2646 		rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2647 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2648 		rx_wr.wr_trans = IBT_RC_SRV;
2649 		rx_wr.wr_nds = 1;
2650 		rx_wr.wr_sgl = &sgl;
2651 
2652 		mutex_enter(&conn->c_lock);
2653 		if (conn->c_state == C_CONNECTED) {
2654 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2655 		}
2656 		if (conn->c_state != C_CONNECTED ||
2657 		    ibt_status != IBT_SUCCESS) {
2658 			if (conn->c_state != C_DISCONN_PEND)
2659 				conn->c_state = C_ERROR_CONN;
2660 			mutex_exit(&conn->c_lock);
2661 			(void) rib_free_sendwait(wdesc);
2662 			return (RDMA_CONNLOST);
2663 		}
2664 		mutex_exit(&conn->c_lock);
2665 
2666 		/*
2667 		 * Wait for send to complete if this is the
2668 		 * last item in the list.
2669 		 */
2670 		if (wait && cl->c_next == NULL) {
2671 			ret = rib_sendwait(qp, wdesc);
2672 			if (ret != 0) {
2673 				return (ret);
2674 			}
2675 		} else {
2676 			mutex_enter(&wdesc->sendwait_lock);
2677 			for (i = 0; i < wdesc->nsbufs; i++) {
2678 				rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2679 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2680 			}
2681 			mutex_exit(&wdesc->sendwait_lock);
2682 			(void) rib_free_sendwait(wdesc);
2683 		}
2684 		cl = cl->c_next;
2685 	}
2686 	return (RDMA_SUCCESS);
2687 }
2688 
2689 /*
2690  * rib_srv_cm_handler()
2691  *    Connection Manager callback to handle RC connection requests.
2692  */
2693 /* ARGSUSED */
2694 static ibt_cm_status_t
2695 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2696 	ibt_cm_return_args_t *ret_args, void *priv_data,
2697 	ibt_priv_data_len_t len)
2698 {
2699 	queue_t		*q;
2700 	rib_qp_t	*qp;
2701 	rpcib_state_t	*ribstat;
2702 	rib_hca_t	*hca;
2703 	rdma_stat	status = RDMA_SUCCESS;
2704 	int		i;
2705 	struct clist	cl;
2706 	rdma_buf_t	rdbuf = {0};
2707 	void		*buf = NULL;
2708 	CONN		*conn;
2709 	ibt_ip_cm_info_t	ipinfo;
2710 	struct sockaddr_in *s;
2711 	struct sockaddr_in6 *s6;
2712 	int sin_size = sizeof (struct sockaddr_in);
2713 	int in_size = sizeof (struct in_addr);
2714 	int sin6_size = sizeof (struct sockaddr_in6);
2715 
2716 	ASSERT(any != NULL);
2717 	ASSERT(event != NULL);
2718 
2719 	ribstat = (rpcib_state_t *)any;
2720 	hca = (rib_hca_t *)ribstat->hca;
2721 	ASSERT(hca != NULL);
2722 
2723 	/* got a connection request */
2724 	switch (event->cm_type) {
2725 	case IBT_CM_EVENT_REQ_RCV:
2726 		/*
2727 		 * If the plugin is in the NO_ACCEPT state, bail out.
2728 		 */
2729 		mutex_enter(&plugin_state_lock);
2730 		if (plugin_state == NO_ACCEPT) {
2731 			mutex_exit(&plugin_state_lock);
2732 			return (IBT_CM_REJECT);
2733 		}
2734 		mutex_exit(&plugin_state_lock);
2735 
2736 		/*
2737 		 * Need to send a MRA MAD to CM so that it does not
2738 		 * timeout on us.
2739 		 */
2740 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2741 		    event->cm_event.req.req_timeout * 8, NULL, 0);
2742 
2743 		mutex_enter(&rib_stat->open_hca_lock);
2744 		q = rib_stat->q;
2745 		mutex_exit(&rib_stat->open_hca_lock);
2746 
2747 		status = rib_svc_create_chan(hca, (caddr_t)q,
2748 		    event->cm_event.req.req_prim_hca_port, &qp);
2749 
2750 		if (status) {
2751 			return (IBT_CM_REJECT);
2752 		}
2753 
2754 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2755 		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2756 		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2757 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2758 
2759 		/*
2760 		 * Pre-posts RECV buffers
2761 		 */
2762 		conn = qptoc(qp);
2763 		for (i = 0; i < preposted_rbufs; i++) {
2764 			bzero(&rdbuf, sizeof (rdbuf));
2765 			rdbuf.type = RECV_BUFFER;
2766 			buf = rib_rbuf_alloc(conn, &rdbuf);
2767 			if (buf == NULL) {
2768 				(void) rib_disconnect_channel(conn, NULL);
2769 				return (IBT_CM_REJECT);
2770 			}
2771 
2772 			bzero(&cl, sizeof (cl));
2773 			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
2774 			cl.c_len = rdbuf.len;
2775 			cl.c_smemhandle.mrc_lmr =
2776 			    rdbuf.handle.mrc_lmr; /* lkey */
2777 			cl.c_next = NULL;
2778 			status = rib_post_recv(conn, &cl);
2779 			if (status != RDMA_SUCCESS) {
2780 				(void) rib_disconnect_channel(conn, NULL);
2781 				return (IBT_CM_REJECT);
2782 			}
2783 		}
2784 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
2785 
2786 		/*
2787 		 * Get the address translation
2788 		 */
2789 		rw_enter(&hca->state_lock, RW_READER);
2790 		if (hca->state == HCA_DETACHED) {
2791 			rw_exit(&hca->state_lock);
2792 			return (IBT_CM_REJECT);
2793 		}
2794 		rw_exit(&hca->state_lock);
2795 
2796 		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
2797 
2798 		if (ibt_get_ip_data(event->cm_priv_data_len,
2799 		    event->cm_priv_data,
2800 		    &ipinfo) != IBT_SUCCESS) {
2801 
2802 			return (IBT_CM_REJECT);
2803 		}
2804 
2805 		switch (ipinfo.src_addr.family) {
2806 		case AF_INET:
2807 
2808 			conn->c_raddr.maxlen =
2809 			    conn->c_raddr.len = sin_size;
2810 			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
2811 
2812 			s = (struct sockaddr_in *)conn->c_raddr.buf;
2813 			s->sin_family = AF_INET;
2814 
2815 			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
2816 			    &s->sin_addr, in_size);
2817 
2818 			break;
2819 
2820 		case AF_INET6:
2821 
2822 			conn->c_raddr.maxlen =
2823 			    conn->c_raddr.len = sin6_size;
2824 			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
2825 
2826 			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
2827 			s6->sin6_family = AF_INET6;
2828 			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
2829 			    &s6->sin6_addr,
2830 			    sizeof (struct in6_addr));
2831 
2832 			break;
2833 
2834 		default:
2835 			return (IBT_CM_REJECT);
2836 		}
2837 
2838 		break;
2839 
2840 	case IBT_CM_EVENT_CONN_CLOSED:
2841 	{
2842 		CONN		*conn;
2843 		rib_qp_t	*qp;
2844 
2845 		switch (event->cm_event.closed) {
2846 		case IBT_CM_CLOSED_DREP_RCVD:
2847 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
2848 		case IBT_CM_CLOSED_DUP:
2849 		case IBT_CM_CLOSED_ABORT:
2850 		case IBT_CM_CLOSED_ALREADY:
2851 			/*
2852 			 * These cases indicate the local end initiated
2853 			 * the closing of the channel. Nothing to do here.
2854 			 */
2855 			break;
2856 		default:
2857 			/*
2858 			 * Reason for CONN_CLOSED event must be one of
2859 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
2860 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
2861 			 * the remote end is closing the channel. In these
2862 			 * cases free the channel and transition to error
2863 			 * state
2864 			 */
2865 			qp = ibt_get_chan_private(event->cm_channel);
2866 			conn = qptoc(qp);
2867 			mutex_enter(&conn->c_lock);
2868 			if (conn->c_state == C_DISCONN_PEND) {
2869 				mutex_exit(&conn->c_lock);
2870 				break;
2871 			}
2872 			conn->c_state = C_ERROR_CONN;
2873 
2874 			/*
2875 			 * Free the rc_channel. Channel has already
2876 			 * transitioned to ERROR state and WRs have been
2877 			 * FLUSHED_ERR already.
2878 			 */
2879 			(void) ibt_free_channel(qp->qp_hdl);
2880 			qp->qp_hdl = NULL;
2881 
2882 			/*
2883 			 * Free the conn if c_ref goes down to 0
2884 			 */
2885 			if (conn->c_ref == 0) {
2886 				/*
2887 				 * Remove from list and free conn
2888 				 */
2889 				conn->c_state = C_DISCONN_PEND;
2890 				mutex_exit(&conn->c_lock);
2891 				(void) rib_disconnect_channel(conn,
2892 				    &hca->srv_conn_list);
2893 			} else {
2894 				mutex_exit(&conn->c_lock);
2895 			}
2896 			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
2897 			break;
2898 		}
2899 		break;
2900 	}
2901 	case IBT_CM_EVENT_CONN_EST:
2902 		/*
2903 		 * RTU received, hence connection established.
2904 		 */
2905 		if (rib_debug > 1)
2906 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2907 			    "(CONN_EST) channel established");
2908 		break;
2909 
2910 	default:
2911 		if (rib_debug > 2) {
2912 			/* Let CM handle the following events. */
2913 			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
2914 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2915 				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
2916 			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
2917 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2918 				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
2919 			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
2920 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2921 				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
2922 			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
2923 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2924 				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
2925 			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
2926 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2927 				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
2928 			}
2929 		}
2930 		return (IBT_CM_DEFAULT);
2931 	}
2932 
2933 	/* accept all other CM messages (i.e. let the CM handle them) */
2934 	return (IBT_CM_ACCEPT);
2935 }
2936 
2937 static rdma_stat
2938 rib_register_service(rib_hca_t *hca, int service_type)
2939 {
2940 	ibt_srv_desc_t		sdesc;
2941 	ibt_hca_portinfo_t	*port_infop;
2942 	ib_svc_id_t		srv_id;
2943 	ibt_srv_hdl_t		srv_hdl;
2944 	uint_t			port_size;
2945 	uint_t			pki, i, num_ports, nbinds;
2946 	ibt_status_t		ibt_status;
2947 	rib_service_t		*new_service;
2948 	ib_pkey_t		pkey;
2949 
2950 	/*
2951 	 * Query all ports for the given HCA
2952 	 */
2953 	rw_enter(&hca->state_lock, RW_READER);
2954 	if (hca->state != HCA_DETACHED) {
2955 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
2956 		    &num_ports, &port_size);
2957 		rw_exit(&hca->state_lock);
2958 	} else {
2959 		rw_exit(&hca->state_lock);
2960 		return (RDMA_FAILED);
2961 	}
2962 	if (ibt_status != IBT_SUCCESS) {
2963 		return (RDMA_FAILED);
2964 	}
2965 
2966 	DTRACE_PROBE1(rpcib__i__regservice_numports,
2967 	    int, num_ports);
2968 
2969 	for (i = 0; i < num_ports; i++) {
2970 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
2971 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
2972 			    int, i+1);
2973 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
2974 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
2975 			    int, i+1);
2976 		}
2977 	}
2978 
2979 	/*
2980 	 * Get all the IP addresses on this system to register the
2981 	 * given "service type" on all DNS recognized IP addrs.
2982 	 * Each service type such as NFS will have all the systems
2983 	 * IP addresses as its different names. For now the only
2984 	 * type of service we support in RPCIB is NFS.
2985 	 */
2986 	rw_enter(&hca->service_list_lock, RW_WRITER);
2987 	/*
2988 	 * Start registering and binding service to active
2989 	 * on active ports on this HCA.
2990 	 */
2991 	nbinds = 0;
2992 	new_service = NULL;
2993 
2994 	/*
2995 	 * We use IP addresses as the service names for
2996 	 * service registration.  Register each of them
2997 	 * with CM to obtain a svc_id and svc_hdl.  We do not
2998 	 * register the service with machine's loopback address.
2999 	 */
3000 	(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3001 	(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3002 	(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3003 
3004 	sdesc.sd_handler = rib_srv_cm_handler;
3005 	sdesc.sd_flags = 0;
3006 	ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3007 	    &sdesc, ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT),
3008 	    1, &srv_hdl, &srv_id);
3009 
3010 	for (i = 0; i < num_ports; i++) {
3011 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3012 			continue;
3013 
3014 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3015 			pkey = port_infop[i].p_pkey_tbl[pki];
3016 			if ((pkey & IBSRM_HB) &&
3017 			    (pkey != IB_PKEY_INVALID_FULL)) {
3018 
3019 				/*
3020 				 * Allocate and prepare a service entry
3021 				 */
3022 				new_service =
3023 				    kmem_zalloc(1 * sizeof (rib_service_t),
3024 				    KM_SLEEP);
3025 
3026 				new_service->srv_type = service_type;
3027 				new_service->srv_hdl = srv_hdl;
3028 				new_service->srv_next = NULL;
3029 
3030 				ibt_status = ibt_bind_service(srv_hdl,
3031 				    port_infop[i].p_sgid_tbl[0],
3032 				    NULL, rib_stat, NULL);
3033 
3034 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3035 				    int, ibt_status);
3036 
3037 				if (ibt_status != IBT_SUCCESS) {
3038 					kmem_free(new_service,
3039 					    sizeof (rib_service_t));
3040 					new_service = NULL;
3041 					continue;
3042 				}
3043 
3044 				/*
3045 				 * Add to the service list for this HCA
3046 				 */
3047 				new_service->srv_next = hca->service_list;
3048 				hca->service_list = new_service;
3049 				new_service = NULL;
3050 				nbinds++;
3051 			}
3052 		}
3053 	}
3054 	rw_exit(&hca->service_list_lock);
3055 
3056 	ibt_free_portinfo(port_infop, port_size);
3057 
3058 	if (nbinds == 0) {
3059 		return (RDMA_FAILED);
3060 	} else {
3061 		/*
3062 		 * Put this plugin into accept state, since atleast
3063 		 * one registration was successful.
3064 		 */
3065 		mutex_enter(&plugin_state_lock);
3066 		plugin_state = ACCEPT;
3067 		mutex_exit(&plugin_state_lock);
3068 		return (RDMA_SUCCESS);
3069 	}
3070 }
3071 
3072 void
3073 rib_listen(struct rdma_svc_data *rd)
3074 {
3075 	rdma_stat status = RDMA_SUCCESS;
3076 
3077 	rd->active = 0;
3078 	rd->err_code = RDMA_FAILED;
3079 
3080 	/*
3081 	 * First check if a hca is still attached
3082 	 */
3083 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3084 	if (rib_stat->hca->state != HCA_INITED) {
3085 		rw_exit(&rib_stat->hca->state_lock);
3086 		return;
3087 	}
3088 	rw_exit(&rib_stat->hca->state_lock);
3089 
3090 	rib_stat->q = &rd->q;
3091 	/*
3092 	 * Right now the only service type is NFS. Hence force feed this
3093 	 * value. Ideally to communicate the service type it should be
3094 	 * passed down in rdma_svc_data.
3095 	 */
3096 	rib_stat->service_type = NFS;
3097 	status = rib_register_service(rib_stat->hca, NFS);
3098 	if (status != RDMA_SUCCESS) {
3099 		rd->err_code = status;
3100 		return;
3101 	}
3102 	/*
3103 	 * Service active on an HCA, check rd->err_code for more
3104 	 * explainable errors.
3105 	 */
3106 	rd->active = 1;
3107 	rd->err_code = status;
3108 }
3109 
3110 /* XXXX */
3111 /* ARGSUSED */
3112 static void
3113 rib_listen_stop(struct rdma_svc_data *svcdata)
3114 {
3115 	rib_hca_t		*hca;
3116 
3117 	/*
3118 	 * KRPC called the RDMATF to stop the listeners, this means
3119 	 * stop sending incomming or recieved requests to KRPC master
3120 	 * transport handle for RDMA-IB. This is also means that the
3121 	 * master transport handle, responsible for us, is going away.
3122 	 */
3123 	mutex_enter(&plugin_state_lock);
3124 	plugin_state = NO_ACCEPT;
3125 	if (svcdata != NULL)
3126 		svcdata->active = 0;
3127 	mutex_exit(&plugin_state_lock);
3128 
3129 	/*
3130 	 * First check if a hca is still attached
3131 	 */
3132 	hca = rib_stat->hca;
3133 	rw_enter(&hca->state_lock, RW_READER);
3134 	if (hca->state != HCA_INITED) {
3135 		rw_exit(&hca->state_lock);
3136 		return;
3137 	}
3138 	rib_close_channels(&hca->srv_conn_list);
3139 	rib_stop_services(hca);
3140 	rw_exit(&hca->state_lock);
3141 }
3142 
3143 /*
3144  * Traverse the HCA's service list to unbind and deregister services.
3145  * Instead of unbinding the service for a service handle by
3146  * calling ibt_unbind_service() for each port/pkey, we unbind
3147  * all the services for the service handle by making only one
3148  * call to ibt_unbind_all_services().  Then, we deregister the
3149  * service for the service handle.
3150  *
3151  * When traversing the entries in service_list, we compare the
3152  * srv_hdl of the current entry with that of the next.  If they
3153  * are different or if the next entry is NULL, the current entry
3154  * marks the last binding of the service handle.  In this case,
3155  * call ibt_unbind_all_services() and deregister the service for
3156  * the service handle.  If they are the same, the current and the
3157  * next entries are bound to the same service handle.  In this
3158  * case, move on to the next entry.
3159  */
3160 static void
3161 rib_stop_services(rib_hca_t *hca)
3162 {
3163 	rib_service_t		*srv_list, *to_remove;
3164 
3165 	/*
3166 	 * unbind and deregister the services for this service type.
3167 	 * Right now there is only one service type. In future it will
3168 	 * be passed down to this function.
3169 	 */
3170 	rw_enter(&hca->service_list_lock, RW_WRITER);
3171 	srv_list = hca->service_list;
3172 	while (srv_list != NULL) {
3173 		to_remove = srv_list;
3174 		srv_list = to_remove->srv_next;
3175 		if (srv_list == NULL || bcmp(to_remove->srv_hdl,
3176 		    srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
3177 
3178 			(void) ibt_unbind_all_services(to_remove->srv_hdl);
3179 			(void) ibt_deregister_service(hca->ibt_clnt_hdl,
3180 			    to_remove->srv_hdl);
3181 		}
3182 
3183 		kmem_free(to_remove, sizeof (rib_service_t));
3184 	}
3185 	hca->service_list = NULL;
3186 	rw_exit(&hca->service_list_lock);
3187 }
3188 
3189 static struct svc_recv *
3190 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3191 {
3192 	struct svc_recv	*recvp;
3193 
3194 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3195 	recvp->vaddr = sgl->ds_va;
3196 	recvp->qp = qp;
3197 	recvp->bytes_xfer = 0;
3198 	return (recvp);
3199 }
3200 
3201 static int
3202 rib_free_svc_recv(struct svc_recv *recvp)
3203 {
3204 	kmem_free(recvp, sizeof (*recvp));
3205 
3206 	return (0);
3207 }
3208 
3209 static struct reply *
3210 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3211 {
3212 	struct reply	*rep;
3213 
3214 
3215 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3216 	if (rep == NULL) {
3217 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3218 		return (NULL);
3219 	}
3220 	rep->xid = msgid;
3221 	rep->vaddr_cq = NULL;
3222 	rep->bytes_xfer = 0;
3223 	rep->status = (uint_t)REPLY_WAIT;
3224 	rep->prev = NULL;
3225 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3226 
3227 	mutex_enter(&qp->replylist_lock);
3228 	if (qp->replylist) {
3229 		rep->next = qp->replylist;
3230 		qp->replylist->prev = rep;
3231 	}
3232 	qp->rep_list_size++;
3233 
3234 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3235 	    int, qp->rep_list_size);
3236 
3237 	qp->replylist = rep;
3238 	mutex_exit(&qp->replylist_lock);
3239 
3240 	return (rep);
3241 }
3242 
3243 static rdma_stat
3244 rib_rem_replylist(rib_qp_t *qp)
3245 {
3246 	struct reply	*r, *n;
3247 
3248 	mutex_enter(&qp->replylist_lock);
3249 	for (r = qp->replylist; r != NULL; r = n) {
3250 		n = r->next;
3251 		(void) rib_remreply(qp, r);
3252 	}
3253 	mutex_exit(&qp->replylist_lock);
3254 
3255 	return (RDMA_SUCCESS);
3256 }
3257 
3258 static int
3259 rib_remreply(rib_qp_t *qp, struct reply *rep)
3260 {
3261 
3262 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3263 	if (rep->prev) {
3264 		rep->prev->next = rep->next;
3265 	}
3266 	if (rep->next) {
3267 		rep->next->prev = rep->prev;
3268 	}
3269 	if (qp->replylist == rep)
3270 		qp->replylist = rep->next;
3271 
3272 	cv_destroy(&rep->wait_cv);
3273 	qp->rep_list_size--;
3274 
3275 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3276 	    int, qp->rep_list_size);
3277 
3278 	kmem_free(rep, sizeof (*rep));
3279 
3280 	return (0);
3281 }
3282 
3283 rdma_stat
3284 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3285 	struct mrc *buf_handle)
3286 {
3287 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3288 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3289 	rdma_stat	status;
3290 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3291 
3292 	/*
3293 	 * Note: ALL buffer pools use the same memory type RDMARW.
3294 	 */
3295 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3296 	if (status == RDMA_SUCCESS) {
3297 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3298 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3299 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3300 	} else {
3301 		buf_handle->mrc_linfo = NULL;
3302 		buf_handle->mrc_lmr = 0;
3303 		buf_handle->mrc_rmr = 0;
3304 	}
3305 	return (status);
3306 }
3307 
3308 static rdma_stat
3309 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3310 	ibt_mr_flags_t spec,
3311 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3312 {
3313 	ibt_mr_attr_t	mem_attr;
3314 	ibt_status_t	ibt_status;
3315 	mem_attr.mr_vaddr = (uintptr_t)buf;
3316 	mem_attr.mr_len = (ib_msglen_t)size;
3317 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3318 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3319 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3320 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3321 
3322 	rw_enter(&hca->state_lock, RW_READER);
3323 	if (hca->state == HCA_INITED) {
3324 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3325 		    &mem_attr, mr_hdlp, mr_descp);
3326 		rw_exit(&hca->state_lock);
3327 	} else {
3328 		rw_exit(&hca->state_lock);
3329 		return (RDMA_FAILED);
3330 	}
3331 
3332 	if (ibt_status != IBT_SUCCESS) {
3333 		return (RDMA_FAILED);
3334 	}
3335 	return (RDMA_SUCCESS);
3336 }
3337 
3338 rdma_stat
3339 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3340 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3341 {
3342 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3343 	rib_lrc_entry_t *l;
3344 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3345 	rdma_stat	status;
3346 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3347 
3348 	/*
3349 	 * Non-coherent memory registration.
3350 	 */
3351 	l = (rib_lrc_entry_t *)lrc;
3352 	if (l) {
3353 		if (l->registered) {
3354 			buf_handle->mrc_linfo =
3355 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3356 			buf_handle->mrc_lmr =
3357 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3358 			buf_handle->mrc_rmr =
3359 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3360 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3361 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3362 			return (RDMA_SUCCESS);
3363 		} else {
3364 			/* Always register the whole buffer */
3365 			buf = (caddr_t)l->lrc_buf;
3366 			buflen = l->lrc_len;
3367 		}
3368 	}
3369 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3370 
3371 	if (status == RDMA_SUCCESS) {
3372 		if (l) {
3373 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3374 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3375 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3376 			l->registered		 = TRUE;
3377 		}
3378 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3379 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3380 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3381 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3382 	} else {
3383 		buf_handle->mrc_linfo = NULL;
3384 		buf_handle->mrc_lmr = 0;
3385 		buf_handle->mrc_rmr = 0;
3386 	}
3387 	return (status);
3388 }
3389 
3390 /* ARGSUSED */
3391 rdma_stat
3392 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3393 {
3394 	rib_hca_t *hca = (ctoqp(conn))->hca;
3395 	/*
3396 	 * Allow memory deregistration even if HCA is
3397 	 * getting detached. Need all outstanding
3398 	 * memory registrations to be deregistered
3399 	 * before HCA_DETACH_EVENT can be accepted.
3400 	 */
3401 	(void) ibt_deregister_mr(hca->hca_hdl,
3402 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3403 	return (RDMA_SUCCESS);
3404 }
3405 
3406 /* ARGSUSED */
3407 rdma_stat
3408 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3409 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3410 {
3411 	rib_lrc_entry_t *l;
3412 	l = (rib_lrc_entry_t *)lrc;
3413 	if (l)
3414 		if (l->registered)
3415 			return (RDMA_SUCCESS);
3416 
3417 	(void) rib_deregistermem(conn, buf, buf_handle);
3418 
3419 	return (RDMA_SUCCESS);
3420 }
3421 
3422 /* ARGSUSED */
3423 rdma_stat
3424 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3425 		int len, int cpu)
3426 {
3427 	ibt_status_t	status;
3428 	rib_hca_t *hca = (ctoqp(conn))->hca;
3429 	ibt_mr_sync_t	mr_segment;
3430 
3431 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3432 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3433 	mr_segment.ms_len = (ib_memlen_t)len;
3434 	if (cpu) {
3435 		/* make incoming data visible to memory */
3436 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3437 	} else {
3438 		/* make memory changes visible to IO */
3439 		mr_segment.ms_flags = IBT_SYNC_READ;
3440 	}
3441 	rw_enter(&hca->state_lock, RW_READER);
3442 	if (hca->state == HCA_INITED) {
3443 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3444 		rw_exit(&hca->state_lock);
3445 	} else {
3446 		rw_exit(&hca->state_lock);
3447 		return (RDMA_FAILED);
3448 	}
3449 
3450 	if (status == IBT_SUCCESS)
3451 		return (RDMA_SUCCESS);
3452 	else {
3453 		return (RDMA_FAILED);
3454 	}
3455 }
3456 
3457 /*
3458  * XXXX	????
3459  */
3460 static rdma_stat
3461 rib_getinfo(rdma_info_t *info)
3462 {
3463 	/*
3464 	 * XXXX	Hack!
3465 	 */
3466 	info->addrlen = 16;
3467 	info->mts = 1000000;
3468 	info->mtu = 1000000;
3469 
3470 	return (RDMA_SUCCESS);
3471 }
3472 
3473 rib_bufpool_t *
3474 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3475 {
3476 	rib_bufpool_t	*rbp = NULL;
3477 	bufpool_t	*bp = NULL;
3478 	caddr_t		buf;
3479 	ibt_mr_attr_t	mem_attr;
3480 	ibt_status_t	ibt_status;
3481 	int		i, j;
3482 
3483 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3484 
3485 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3486 	    num * sizeof (void *), KM_SLEEP);
3487 
3488 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3489 	bp->numelems = num;
3490 
3491 
3492 	switch (ptype) {
3493 	case SEND_BUFFER:
3494 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3495 		bp->rsize = RPC_MSG_SZ;
3496 		break;
3497 	case RECV_BUFFER:
3498 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3499 		bp->rsize = RPC_BUF_SIZE;
3500 		break;
3501 	default:
3502 		goto fail;
3503 	}
3504 
3505 	/*
3506 	 * Register the pool.
3507 	 */
3508 	bp->bufsize = num * bp->rsize;
3509 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3510 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3511 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3512 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3513 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3514 	rw_enter(&hca->state_lock, RW_READER);
3515 
3516 	if (hca->state != HCA_INITED) {
3517 		rw_exit(&hca->state_lock);
3518 		goto fail;
3519 	}
3520 
3521 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3522 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3523 		mem_attr.mr_vaddr = (uintptr_t)buf;
3524 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3525 		mem_attr.mr_as = NULL;
3526 		ibt_status = ibt_register_mr(hca->hca_hdl,
3527 		    hca->pd_hdl, &mem_attr,
3528 		    &rbp->mr_hdl[i],
3529 		    &rbp->mr_desc[i]);
3530 		if (ibt_status != IBT_SUCCESS) {
3531 			for (j = 0; j < i; j++) {
3532 				(void) ibt_deregister_mr(hca->hca_hdl,
3533 				    rbp->mr_hdl[j]);
3534 			}
3535 			rw_exit(&hca->state_lock);
3536 			goto fail;
3537 		}
3538 	}
3539 	rw_exit(&hca->state_lock);
3540 	buf = (caddr_t)bp->buf;
3541 	for (i = 0; i < num; i++, buf += bp->rsize) {
3542 		bp->buflist[i] = (void *)buf;
3543 	}
3544 	bp->buffree = num - 1;	/* no. of free buffers */
3545 	rbp->bpool = bp;
3546 
3547 	return (rbp);
3548 fail:
3549 	if (bp) {
3550 		if (bp->buf)
3551 			kmem_free(bp->buf, bp->bufsize);
3552 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3553 	}
3554 	if (rbp) {
3555 		if (rbp->mr_hdl)
3556 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3557 		if (rbp->mr_desc)
3558 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3559 		kmem_free(rbp, sizeof (rib_bufpool_t));
3560 	}
3561 	return (NULL);
3562 }
3563 
3564 static void
3565 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3566 {
3567 	int i;
3568 	rib_bufpool_t *rbp = NULL;
3569 	bufpool_t *bp;
3570 
3571 	/*
3572 	 * Obtain pool address based on type of pool
3573 	 */
3574 	switch (ptype) {
3575 		case SEND_BUFFER:
3576 			rbp = hca->send_pool;
3577 			break;
3578 		case RECV_BUFFER:
3579 			rbp = hca->recv_pool;
3580 			break;
3581 		default:
3582 			return;
3583 	}
3584 	if (rbp == NULL)
3585 		return;
3586 
3587 	bp = rbp->bpool;
3588 
3589 	/*
3590 	 * Deregister the pool memory and free it.
3591 	 */
3592 	for (i = 0; i < bp->numelems; i++) {
3593 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3594 	}
3595 }
3596 
3597 static void
3598 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3599 {
3600 
3601 	rib_bufpool_t *rbp = NULL;
3602 	bufpool_t *bp;
3603 
3604 	/*
3605 	 * Obtain pool address based on type of pool
3606 	 */
3607 	switch (ptype) {
3608 		case SEND_BUFFER:
3609 			rbp = hca->send_pool;
3610 			break;
3611 		case RECV_BUFFER:
3612 			rbp = hca->recv_pool;
3613 			break;
3614 		default:
3615 			return;
3616 	}
3617 	if (rbp == NULL)
3618 		return;
3619 
3620 	bp = rbp->bpool;
3621 
3622 	/*
3623 	 * Free the pool memory.
3624 	 */
3625 	if (rbp->mr_hdl)
3626 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3627 
3628 	if (rbp->mr_desc)
3629 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3630 	if (bp->buf)
3631 		kmem_free(bp->buf, bp->bufsize);
3632 	mutex_destroy(&bp->buflock);
3633 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3634 	kmem_free(rbp, sizeof (rib_bufpool_t));
3635 }
3636 
3637 void
3638 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3639 {
3640 	/*
3641 	 * Deregister the pool memory and free it.
3642 	 */
3643 	rib_rbufpool_deregister(hca, ptype);
3644 	rib_rbufpool_free(hca, ptype);
3645 }
3646 
3647 /*
3648  * Fetch a buffer from the pool of type specified in rdbuf->type.
3649  */
3650 static rdma_stat
3651 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3652 {
3653 	rib_lrc_entry_t *rlep;
3654 
3655 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
3656 		rlep = rib_get_cache_buf(conn, rdbuf->len);
3657 		rdbuf->rb_private =  (caddr_t)rlep;
3658 		rdbuf->addr = rlep->lrc_buf;
3659 		rdbuf->handle = rlep->lrc_mhandle;
3660 		return (RDMA_SUCCESS);
3661 	}
3662 
3663 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
3664 	if (rdbuf->addr) {
3665 		switch (rdbuf->type) {
3666 		case SEND_BUFFER:
3667 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
3668 			break;
3669 		case RECV_BUFFER:
3670 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
3671 			break;
3672 		default:
3673 			rdbuf->len = 0;
3674 		}
3675 		return (RDMA_SUCCESS);
3676 	} else
3677 		return (RDMA_FAILED);
3678 }
3679 
#if defined(MEASURE_POOL_DEPTH)
/*
 * Empty hooks invoked from rib_rbuf_alloc() with
 * MAX_BUFS - (bp->buffree + 1) for the receive and send pools
 * respectively.  They perform no work themselves.
 */
/* ARGSUSED */
static void
rib_recv_bufs(uint32_t x)
{
}

/* ARGSUSED */
static void
rib_send_bufs(uint32_t x)
{
}
#endif
3689 
3690 /*
3691  * Fetch a buffer of specified type.
3692  * Note that rdbuf->handle is mw's rkey.
3693  */
3694 static void *
3695 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3696 {
3697 	rib_qp_t	*qp = ctoqp(conn);
3698 	rib_hca_t	*hca = qp->hca;
3699 	rdma_btype	ptype = rdbuf->type;
3700 	void		*buf;
3701 	rib_bufpool_t	*rbp = NULL;
3702 	bufpool_t	*bp;
3703 	int		i;
3704 
3705 	/*
3706 	 * Obtain pool address based on type of pool
3707 	 */
3708 	switch (ptype) {
3709 	case SEND_BUFFER:
3710 		rbp = hca->send_pool;
3711 		break;
3712 	case RECV_BUFFER:
3713 		rbp = hca->recv_pool;
3714 		break;
3715 	default:
3716 		return (NULL);
3717 	}
3718 	if (rbp == NULL)
3719 		return (NULL);
3720 
3721 	bp = rbp->bpool;
3722 
3723 	mutex_enter(&bp->buflock);
3724 	if (bp->buffree < 0) {
3725 		mutex_exit(&bp->buflock);
3726 		return (NULL);
3727 	}
3728 
3729 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
3730 	buf = bp->buflist[bp->buffree];
3731 	rdbuf->addr = buf;
3732 	rdbuf->len = bp->rsize;
3733 	for (i = bp->numelems - 1; i >= 0; i--) {
3734 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
3735 			rdbuf->handle.mrc_rmr =
3736 			    (uint32_t)rbp->mr_desc[i].md_rkey;
3737 			rdbuf->handle.mrc_linfo =
3738 			    (uintptr_t)rbp->mr_hdl[i];
3739 			rdbuf->handle.mrc_lmr =
3740 			    (uint32_t)rbp->mr_desc[i].md_lkey;
3741 #if defined(MEASURE_POOL_DEPTH)
3742 			if (ptype == SEND_BUFFER)
3743 				rib_send_bufs(MAX_BUFS - (bp->buffree+1));
3744 			if (ptype == RECV_BUFFER)
3745 				rib_recv_bufs(MAX_BUFS - (bp->buffree+1));
3746 #endif
3747 			bp->buffree--;
3748 
3749 			mutex_exit(&bp->buflock);
3750 
3751 			return (buf);
3752 		}
3753 	}
3754 
3755 	mutex_exit(&bp->buflock);
3756 
3757 	return (NULL);
3758 }
3759 
3760 static void
3761 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
3762 {
3763 
3764 	if (rdbuf->type == RDMA_LONG_BUFFER) {
3765 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
3766 		rdbuf->rb_private = NULL;
3767 		return;
3768 	}
3769 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
3770 }
3771 
3772 static void
3773 rib_rbuf_free(CONN *conn, int ptype, void *buf)
3774 {
3775 	rib_qp_t *qp = ctoqp(conn);
3776 	rib_hca_t *hca = qp->hca;
3777 	rib_bufpool_t *rbp = NULL;
3778 	bufpool_t *bp;
3779 
3780 	/*
3781 	 * Obtain pool address based on type of pool
3782 	 */
3783 	switch (ptype) {
3784 	case SEND_BUFFER:
3785 		rbp = hca->send_pool;
3786 		break;
3787 	case RECV_BUFFER:
3788 		rbp = hca->recv_pool;
3789 		break;
3790 	default:
3791 		return;
3792 	}
3793 	if (rbp == NULL)
3794 		return;
3795 
3796 	bp = rbp->bpool;
3797 
3798 	mutex_enter(&bp->buflock);
3799 	if (++bp->buffree >= bp->numelems) {
3800 		/*
3801 		 * Should never happen
3802 		 */
3803 		bp->buffree--;
3804 	} else {
3805 		bp->buflist[bp->buffree] = buf;
3806 	}
3807 	mutex_exit(&bp->buflock);
3808 }
3809 
3810 static rdma_stat
3811 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
3812 {
3813 	rw_enter(&connlist->conn_lock, RW_WRITER);
3814 	if (connlist->conn_hd) {
3815 		cn->c_next = connlist->conn_hd;
3816 		connlist->conn_hd->c_prev = cn;
3817 	}
3818 	connlist->conn_hd = cn;
3819 	rw_exit(&connlist->conn_lock);
3820 
3821 	return (RDMA_SUCCESS);
3822 }
3823 
3824 static rdma_stat
3825 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
3826 {
3827 	rw_enter(&connlist->conn_lock, RW_WRITER);
3828 	if (cn->c_prev) {
3829 		cn->c_prev->c_next = cn->c_next;
3830 	}
3831 	if (cn->c_next) {
3832 		cn->c_next->c_prev = cn->c_prev;
3833 	}
3834 	if (connlist->conn_hd == cn)
3835 		connlist->conn_hd = cn->c_next;
3836 	rw_exit(&connlist->conn_lock);
3837 
3838 	return (RDMA_SUCCESS);
3839 }
3840 
3841 /*
3842  * Connection management.
3843  * IBTF does not support recycling of channels. So connections are only
3844  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
3845  * C_DISCONN_PEND state. No C_IDLE state.
3846  * C_CONN_PEND state: Connection establishment in progress to the server.
3847  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
3848  * It has an RC channel associated with it. ibt_post_send/recv are allowed
3849  * only in this state.
3850  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
3851  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
3852  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
3853  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
3854  * c_ref drops to 0 (this indicates that RPC has no more references to this
3855  * connection), the connection should be destroyed. A connection transitions
3856  * into this state when it is being destroyed.
3857  */
3858 /* ARGSUSED */
3859 static rdma_stat
3860 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
3861 {
3862 	CONN *cn;
3863 	int status = RDMA_SUCCESS;
3864 	rib_hca_t *hca = rib_stat->hca;
3865 	rib_qp_t *qp;
3866 	clock_t cv_stat, timout;
3867 	rpcib_ping_t rpt;
3868 
3869 	if (hca == NULL)
3870 		return (RDMA_FAILED);
3871 
3872 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3873 	if (hca->state == HCA_DETACHED) {
3874 		rw_exit(&rib_stat->hca->state_lock);
3875 		return (RDMA_FAILED);
3876 	}
3877 	rw_exit(&rib_stat->hca->state_lock);
3878 
3879 again:
3880 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
3881 	cn = hca->cl_conn_list.conn_hd;
3882 	while (cn != NULL) {
3883 		/*
3884 		 * First, clear up any connection in the ERROR state
3885 		 */
3886 		mutex_enter(&cn->c_lock);
3887 		if (cn->c_state == C_ERROR_CONN) {
3888 			if (cn->c_ref == 0) {
3889 				/*
3890 				 * Remove connection from list and destroy it.
3891 				 */
3892 				cn->c_state = C_DISCONN_PEND;
3893 				mutex_exit(&cn->c_lock);
3894 				rw_exit(&hca->cl_conn_list.conn_lock);
3895 				(void) rib_disconnect_channel(cn,
3896 				    &hca->cl_conn_list);
3897 				goto again;
3898 			}
3899 			mutex_exit(&cn->c_lock);
3900 			cn = cn->c_next;
3901 			continue;
3902 		}
3903 		if (cn->c_state == C_DISCONN_PEND) {
3904 			mutex_exit(&cn->c_lock);
3905 			cn = cn->c_next;
3906 			continue;
3907 		}
3908 		if ((cn->c_raddr.len == svcaddr->len) &&
3909 		    bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
3910 			/*
3911 			 * Our connection. Give up conn list lock
3912 			 * as we are done traversing the list.
3913 			 */
3914 			rw_exit(&hca->cl_conn_list.conn_lock);
3915 			if (cn->c_state == C_CONNECTED) {
3916 				cn->c_ref++;	/* sharing a conn */
3917 				mutex_exit(&cn->c_lock);
3918 				*conn = cn;
3919 				return (status);
3920 			}
3921 			if (cn->c_state == C_CONN_PEND) {
3922 				/*
3923 				 * Hold a reference to this conn before
3924 				 * we give up the lock.
3925 				 */
3926 				cn->c_ref++;
3927 				timout =  ddi_get_lbolt() +
3928 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
3929 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
3930 				    &cn->c_lock, timout)) > 0 &&
3931 				    cn->c_state == C_CONN_PEND)
3932 					;
3933 				if (cv_stat == 0) {
3934 					cn->c_ref--;
3935 					mutex_exit(&cn->c_lock);
3936 					return (RDMA_INTR);
3937 				}
3938 				if (cv_stat < 0) {
3939 					cn->c_ref--;
3940 					mutex_exit(&cn->c_lock);
3941 					return (RDMA_TIMEDOUT);
3942 				}
3943 				if (cn->c_state == C_CONNECTED) {
3944 					*conn = cn;
3945 					mutex_exit(&cn->c_lock);
3946 					return (status);
3947 				} else {
3948 					cn->c_ref--;
3949 					mutex_exit(&cn->c_lock);
3950 					return (RDMA_TIMEDOUT);
3951 				}
3952 			}
3953 		}
3954 		mutex_exit(&cn->c_lock);
3955 		cn = cn->c_next;
3956 	}
3957 	rw_exit(&hca->cl_conn_list.conn_lock);
3958 
3959 	bzero(&rpt, sizeof (rpcib_ping_t));
3960 
3961 	status = rib_ping_srv(addr_type, svcaddr, &rpt);
3962 	if (status != RDMA_SUCCESS) {
3963 		return (RDMA_FAILED);
3964 	}
3965 
3966 	/*
3967 	 * Channel to server doesn't exist yet, create one.
3968 	 */
3969 	if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
3970 		return (RDMA_FAILED);
3971 	}
3972 	cn = qptoc(qp);
3973 	cn->c_state = C_CONN_PEND;
3974 	cn->c_ref = 1;
3975 
3976 	/*
3977 	 * Add to conn list.
3978 	 * We had given up the READER lock. In the time since then,
3979 	 * another thread might have created the connection we are
3980 	 * trying here. But for now, that is quiet alright - there
3981 	 * might be two connections between a pair of hosts instead
3982 	 * of one. If we really want to close that window,
3983 	 * then need to check the list after acquiring the
3984 	 * WRITER lock.
3985 	 */
3986 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
3987 	status = rib_conn_to_srv(hca, qp, &rpt);
3988 	mutex_enter(&cn->c_lock);
3989 	if (status == RDMA_SUCCESS) {
3990 		cn->c_state = C_CONNECTED;
3991 		*conn = cn;
3992 	} else {
3993 		cn->c_state = C_ERROR_CONN;
3994 		cn->c_ref--;
3995 	}
3996 	cv_broadcast(&cn->c_cv);
3997 	mutex_exit(&cn->c_lock);
3998 	return (status);
3999 }
4000 
4001 static rdma_stat
4002 rib_conn_release(CONN *conn)
4003 {
4004 	rib_qp_t	*qp = ctoqp(conn);
4005 
4006 	mutex_enter(&conn->c_lock);
4007 	conn->c_ref--;
4008 
4009 	/*
4010 	 * If a conn is C_ERROR_CONN, close the channel.
4011 	 * If it's CONNECTED, keep it that way.
4012 	 */
4013 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4014 		conn->c_state = C_DISCONN_PEND;
4015 		mutex_exit(&conn->c_lock);
4016 		if (qp->mode == RIB_SERVER)
4017 			(void) rib_disconnect_channel(conn,
4018 			    &qp->hca->srv_conn_list);
4019 		else
4020 			(void) rib_disconnect_channel(conn,
4021 			    &qp->hca->cl_conn_list);
4022 		return (RDMA_SUCCESS);
4023 	}
4024 	mutex_exit(&conn->c_lock);
4025 	return (RDMA_SUCCESS);
4026 }
4027 
4028 /*
4029  * Add at front of list
4030  */
4031 static struct rdma_done_list *
4032 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4033 {
4034 	struct rdma_done_list *rd;
4035 
4036 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4037 
4038 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4039 	rd->xid = xid;
4040 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4041 
4042 	rd->prev = NULL;
4043 	rd->next = qp->rdlist;
4044 	if (qp->rdlist != NULL)
4045 		qp->rdlist->prev = rd;
4046 	qp->rdlist = rd;
4047 
4048 	return (rd);
4049 }
4050 
4051 static void
4052 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4053 {
4054 	struct rdma_done_list *r;
4055 
4056 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4057 
4058 	r = rd->next;
4059 	if (r != NULL) {
4060 		r->prev = rd->prev;
4061 	}
4062 
4063 	r = rd->prev;
4064 	if (r != NULL) {
4065 		r->next = rd->next;
4066 	} else {
4067 		qp->rdlist = rd->next;
4068 	}
4069 
4070 	cv_destroy(&rd->rdma_done_cv);
4071 	kmem_free(rd, sizeof (*rd));
4072 }
4073 
4074 static void
4075 rdma_done_rem_list(rib_qp_t *qp)
4076 {
4077 	struct rdma_done_list	*r, *n;
4078 
4079 	mutex_enter(&qp->rdlist_lock);
4080 	for (r = qp->rdlist; r != NULL; r = n) {
4081 		n = r->next;
4082 		rdma_done_rm(qp, r);
4083 	}
4084 	mutex_exit(&qp->rdlist_lock);
4085 }
4086 
4087 static void
4088 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4089 {
4090 	struct rdma_done_list *r = qp->rdlist;
4091 
4092 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4093 
4094 	while (r) {
4095 		if (r->xid == xid) {
4096 			cv_signal(&r->rdma_done_cv);
4097 			return;
4098 		} else {
4099 			r = r->next;
4100 		}
4101 	}
4102 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4103 	    int, xid);
4104 }
4105 
4106 
4107 /*
4108  * Goes through all connections and closes the channel
4109  * This will cause all the WRs on those channels to be
4110  * flushed.
4111  */
4112 static void
4113 rib_close_channels(rib_conn_list_t *connlist)
4114 {
4115 	CONN 		*conn;
4116 	rib_qp_t	*qp;
4117 
4118 	rw_enter(&connlist->conn_lock, RW_READER);
4119 	conn = connlist->conn_hd;
4120 	while (conn != NULL) {
4121 		mutex_enter(&conn->c_lock);
4122 		qp = ctoqp(conn);
4123 		if (conn->c_state == C_CONNECTED) {
4124 			/*
4125 			 * Live connection in CONNECTED state.
4126 			 * Call ibt_close_rc_channel in nonblocking mode
4127 			 * with no callbacks.
4128 			 */
4129 			conn->c_state = C_ERROR_CONN;
4130 			(void) ibt_close_rc_channel(qp->qp_hdl,
4131 			    IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
4132 			(void) ibt_free_channel(qp->qp_hdl);
4133 			qp->qp_hdl = NULL;
4134 		} else {
4135 			if (conn->c_state == C_ERROR_CONN &&
4136 			    qp->qp_hdl != NULL) {
4137 				/*
4138 				 * Connection in ERROR state but
4139 				 * channel is not yet freed.
4140 				 */
4141 				(void) ibt_close_rc_channel(qp->qp_hdl,
4142 				    IBT_NOCALLBACKS, NULL, 0, NULL,
4143 				    NULL, 0);
4144 				(void) ibt_free_channel(qp->qp_hdl);
4145 				qp->qp_hdl = NULL;
4146 			}
4147 		}
4148 		mutex_exit(&conn->c_lock);
4149 		conn = conn->c_next;
4150 	}
4151 	rw_exit(&connlist->conn_lock);
4152 }
4153 
4154 /*
4155  * Frees up all connections that are no longer being referenced
4156  */
4157 static void
4158 rib_purge_connlist(rib_conn_list_t *connlist)
4159 {
4160 	CONN 		*conn;
4161 
4162 top:
4163 	rw_enter(&connlist->conn_lock, RW_READER);
4164 	conn = connlist->conn_hd;
4165 	while (conn != NULL) {
4166 		mutex_enter(&conn->c_lock);
4167 
4168 		/*
4169 		 * At this point connection is either in ERROR
4170 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4171 		 * then some other thread is culling that connection.
4172 		 * If not and if c_ref is 0, then destroy the connection.
4173 		 */
4174 		if (conn->c_ref == 0 &&
4175 		    conn->c_state != C_DISCONN_PEND) {
4176 			/*
4177 			 * Cull the connection
4178 			 */
4179 			conn->c_state = C_DISCONN_PEND;
4180 			mutex_exit(&conn->c_lock);
4181 			rw_exit(&connlist->conn_lock);
4182 			(void) rib_disconnect_channel(conn, connlist);
4183 			goto top;
4184 		} else {
4185 			/*
4186 			 * conn disconnect already scheduled or will
4187 			 * happen from conn_release when c_ref drops to 0.
4188 			 */
4189 			mutex_exit(&conn->c_lock);
4190 		}
4191 		conn = conn->c_next;
4192 	}
4193 	rw_exit(&connlist->conn_lock);
4194 
4195 	/*
4196 	 * At this point, only connections with c_ref != 0 are on the list
4197 	 */
4198 }
4199 
4200 /*
4201  * Cleans and closes up all uses of the HCA
4202  */
4203 static void
4204 rib_detach_hca(rib_hca_t *hca)
4205 {
4206 
4207 	/*
4208 	 * Stop all services on the HCA
4209 	 * Go through cl_conn_list and close all rc_channels
4210 	 * Go through svr_conn_list and close all rc_channels
4211 	 * Free connections whose c_ref has dropped to 0
4212 	 * Destroy all CQs
4213 	 * Deregister and released all buffer pool memory after all
4214 	 * connections are destroyed
4215 	 * Free the protection domain
4216 	 * ibt_close_hca()
4217 	 */
4218 	rw_enter(&hca->state_lock, RW_WRITER);
4219 	if (hca->state == HCA_DETACHED) {
4220 		rw_exit(&hca->state_lock);
4221 		return;
4222 	}
4223 
4224 	hca->state = HCA_DETACHED;
4225 	rib_stat->nhca_inited--;
4226 
4227 	rib_stop_services(hca);
4228 	rib_close_channels(&hca->cl_conn_list);
4229 	rib_close_channels(&hca->srv_conn_list);
4230 
4231 	rib_mod.rdma_count--;
4232 
4233 	rw_exit(&hca->state_lock);
4234 
4235 	/*
4236 	 * purge will free all datastructures used by CQ handlers. We don't
4237 	 * want to receive completions after purge, so we'll free the CQs now.
4238 	 */
4239 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4240 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4241 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4242 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4243 
4244 	rib_purge_connlist(&hca->cl_conn_list);
4245 	rib_purge_connlist(&hca->srv_conn_list);
4246 
4247 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4248 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4249 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4250 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4251 	if (stats_enabled) {
4252 		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4253 		    GLOBAL_ZONEID);
4254 	}
4255 
4256 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4257 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4258 	if (hca->srv_conn_list.conn_hd == NULL &&
4259 	    hca->cl_conn_list.conn_hd == NULL) {
4260 		/*
4261 		 * conn_lists are NULL, so destroy
4262 		 * buffers, close hca and be done.
4263 		 */
4264 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4265 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4266 		rib_destroy_cache(hca);
4267 		rdma_unregister_mod(&rib_mod);
4268 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4269 		(void) ibt_close_hca(hca->hca_hdl);
4270 		hca->hca_hdl = NULL;
4271 	}
4272 	rw_exit(&hca->cl_conn_list.conn_lock);
4273 	rw_exit(&hca->srv_conn_list.conn_lock);
4274 
4275 	if (hca->hca_hdl != NULL) {
4276 		mutex_enter(&hca->inuse_lock);
4277 		while (hca->inuse)
4278 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4279 		mutex_exit(&hca->inuse_lock);
4280 
4281 		rdma_unregister_mod(&rib_mod);
4282 
4283 		/*
4284 		 * conn_lists are now NULL, so destroy
4285 		 * buffers, close hca and be done.
4286 		 */
4287 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4288 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4289 		rib_destroy_cache(hca);
4290 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4291 		(void) ibt_close_hca(hca->hca_hdl);
4292 		hca->hca_hdl = NULL;
4293 	}
4294 }
4295 
4296 static void
4297 rib_server_side_cache_reclaim(void *argp)
4298 {
4299 	cache_avl_struct_t    *rcas;
4300 	rib_lrc_entry_t		*rb;
4301 	rib_hca_t *hca = (rib_hca_t *)argp;
4302 
4303 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4304 	rcas = avl_first(&hca->avl_tree);
4305 	if (rcas != NULL)
4306 		avl_remove(&hca->avl_tree, rcas);
4307 
4308 	while (rcas != NULL) {
4309 		while (rcas->r.forw != &rcas->r) {
4310 			rcas->elements--;
4311 			rib_total_buffers --;
4312 			rb = rcas->r.forw;
4313 			remque(rb);
4314 			if (rb->registered)
4315 				(void) rib_deregistermem_via_hca(hca,
4316 				    rb->lrc_buf, rb->lrc_mhandle);
4317 			cache_allocation -= rb->lrc_len;
4318 			kmem_free(rb->lrc_buf, rb->lrc_len);
4319 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4320 		}
4321 		mutex_destroy(&rcas->node_lock);
4322 		kmem_cache_free(hca->server_side_cache, rcas);
4323 		rcas = avl_first(&hca->avl_tree);
4324 		if (rcas != NULL)
4325 			avl_remove(&hca->avl_tree, rcas);
4326 	}
4327 	rw_exit(&hca->avl_rw_lock);
4328 }
4329 
4330 static void
4331 rib_server_side_cache_cleanup(void *argp)
4332 {
4333 	cache_avl_struct_t    *rcas;
4334 	rib_lrc_entry_t		*rb;
4335 	rib_hca_t *hca = (rib_hca_t *)argp;
4336 
4337 	rw_enter(&hca->avl_rw_lock, RW_READER);
4338 	if (cache_allocation < cache_limit) {
4339 		rw_exit(&hca->avl_rw_lock);
4340 		return;
4341 	}
4342 	rw_exit(&hca->avl_rw_lock);
4343 
4344 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4345 	rcas = avl_last(&hca->avl_tree);
4346 	if (rcas != NULL)
4347 		avl_remove(&hca->avl_tree, rcas);
4348 
4349 	while (rcas != NULL) {
4350 		while (rcas->r.forw != &rcas->r) {
4351 			rcas->elements--;
4352 			rib_total_buffers --;
4353 			rb = rcas->r.forw;
4354 			remque(rb);
4355 			if (rb->registered)
4356 				(void) rib_deregistermem_via_hca(hca,
4357 				    rb->lrc_buf, rb->lrc_mhandle);
4358 			cache_allocation -= rb->lrc_len;
4359 			kmem_free(rb->lrc_buf, rb->lrc_len);
4360 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4361 		}
4362 		mutex_destroy(&rcas->node_lock);
4363 		if (hca->server_side_cache) {
4364 			kmem_cache_free(hca->server_side_cache, rcas);
4365 		}
4366 		if ((cache_allocation) < cache_limit) {
4367 			rw_exit(&hca->avl_rw_lock);
4368 			return;
4369 		}
4370 
4371 		rcas = avl_last(&hca->avl_tree);
4372 		if (rcas != NULL)
4373 			avl_remove(&hca->avl_tree, rcas);
4374 	}
4375 	rw_exit(&hca->avl_rw_lock);
4376 }
4377 
4378 static int
4379 avl_compare(const void *t1, const void *t2)
4380 {
4381 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4382 		return (0);
4383 
4384 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4385 		return (-1);
4386 
4387 	return (1);
4388 }
4389 
4390 static void
4391 rib_destroy_cache(rib_hca_t *hca)
4392 {
4393 	if (hca->reg_cache_clean_up != NULL) {
4394 		ddi_taskq_destroy(hca->reg_cache_clean_up);
4395 		hca->reg_cache_clean_up = NULL;
4396 	}
4397 	if (hca->avl_init) {
4398 		rib_server_side_cache_reclaim((void *)hca);
4399 		if (hca->server_side_cache) {
4400 			kmem_cache_destroy(hca->server_side_cache);
4401 			hca->server_side_cache = NULL;
4402 		}
4403 		avl_destroy(&hca->avl_tree);
4404 		mutex_destroy(&hca->cache_allocation);
4405 		rw_destroy(&hca->avl_rw_lock);
4406 	}
4407 	hca->avl_init = FALSE;
4408 }
4409 
4410 static void
4411 rib_force_cleanup(void *hca)
4412 {
4413 	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
4414 		(void) ddi_taskq_dispatch(
4415 		    ((rib_hca_t *)hca)->reg_cache_clean_up,
4416 		    rib_server_side_cache_cleanup,
4417 		    (void *)hca, DDI_NOSLEEP);
4418 }
4419 
4420 static rib_lrc_entry_t *
4421 rib_get_cache_buf(CONN *conn, uint32_t len)
4422 {
4423 	cache_avl_struct_t	cas, *rcas;
4424 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4425 	rib_lrc_entry_t *reply_buf;
4426 	avl_index_t where = NULL;
4427 	uint64_t c_alloc = 0;
4428 
4429 	if (!hca->avl_init)
4430 		goto  error_alloc;
4431 
4432 	cas.len = len;
4433 
4434 	rw_enter(&hca->avl_rw_lock, RW_READER);
4435 
4436 	mutex_enter(&hca->cache_allocation);
4437 	c_alloc = cache_allocation;
4438 	mutex_exit(&hca->cache_allocation);
4439 
4440 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
4441 	    &where)) == NULL) {
4442 		/* Am I above the cache limit */
4443 		if ((c_alloc + len) >= cache_limit) {
4444 			rib_force_cleanup((void *)hca);
4445 			rw_exit(&hca->avl_rw_lock);
4446 			cache_misses_above_the_limit ++;
4447 
4448 			/* Allocate and register the buffer directly */
4449 			goto error_alloc;
4450 		}
4451 
4452 		rw_exit(&hca->avl_rw_lock);
4453 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
4454 
4455 		/* Recheck to make sure no other thread added the entry in */
4456 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
4457 		    &cas, &where)) == NULL) {
4458 			/* Allocate an avl tree entry */
4459 			rcas = (cache_avl_struct_t *)
4460 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
4461 
4462 			bzero(rcas, sizeof (cache_avl_struct_t));
4463 			rcas->elements = 0;
4464 			rcas->r.forw = &rcas->r;
4465 			rcas->r.back = &rcas->r;
4466 			rcas->len = len;
4467 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
4468 			avl_insert(&hca->avl_tree, rcas, where);
4469 		}
4470 	}
4471 
4472 	mutex_enter(&rcas->node_lock);
4473 
4474 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
4475 		rib_total_buffers--;
4476 		cache_hits++;
4477 		reply_buf = rcas->r.forw;
4478 		remque(reply_buf);
4479 		rcas->elements--;
4480 		mutex_exit(&rcas->node_lock);
4481 		rw_exit(&hca->avl_rw_lock);
4482 		mutex_enter(&hca->cache_allocation);
4483 		cache_allocation -= len;
4484 		mutex_exit(&hca->cache_allocation);
4485 	} else {
4486 		/* Am I above the cache limit */
4487 		mutex_exit(&rcas->node_lock);
4488 		if ((c_alloc + len) >= cache_limit) {
4489 			rib_force_cleanup((void *)hca);
4490 			rw_exit(&hca->avl_rw_lock);
4491 			cache_misses_above_the_limit ++;
4492 			/* Allocate and register the buffer directly */
4493 			goto error_alloc;
4494 		}
4495 		rw_exit(&hca->avl_rw_lock);
4496 		cache_misses ++;
4497 		/* Allocate a reply_buf entry */
4498 		reply_buf = (rib_lrc_entry_t *)
4499 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4500 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
4501 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
4502 		reply_buf->lrc_len  = len;
4503 		reply_buf->registered = FALSE;
4504 		reply_buf->avl_node = (void *)rcas;
4505 	}
4506 
4507 	return (reply_buf);
4508 
4509 error_alloc:
4510 	reply_buf = (rib_lrc_entry_t *)
4511 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4512 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
4513 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
4514 	reply_buf->lrc_len = len;
4515 	reply_buf->registered = FALSE;
4516 	reply_buf->avl_node = NULL;
4517 
4518 	return (reply_buf);
4519 }
4520 
4521 /*
4522  * Return a pre-registered back to the cache (without
4523  * unregistering the buffer)..
4524  */
4525 
4526 static void
4527 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
4528 {
4529 	cache_avl_struct_t    cas, *rcas;
4530 	avl_index_t where = NULL;
4531 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4532 
4533 	if (!hca->avl_init)
4534 		goto  error_free;
4535 
4536 	cas.len = reg_buf->lrc_len;
4537 	rw_enter(&hca->avl_rw_lock, RW_READER);
4538 	if ((rcas = (cache_avl_struct_t *)
4539 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
4540 		rw_exit(&hca->avl_rw_lock);
4541 		goto error_free;
4542 	} else {
4543 		rib_total_buffers ++;
4544 		cas.len = reg_buf->lrc_len;
4545 		mutex_enter(&rcas->node_lock);
4546 		insque(reg_buf, &rcas->r);
4547 		rcas->elements ++;
4548 		mutex_exit(&rcas->node_lock);
4549 		rw_exit(&hca->avl_rw_lock);
4550 		mutex_enter(&hca->cache_allocation);
4551 		cache_allocation += cas.len;
4552 		mutex_exit(&hca->cache_allocation);
4553 	}
4554 
4555 	return;
4556 
4557 error_free:
4558 
4559 	if (reg_buf->registered)
4560 		(void) rib_deregistermem_via_hca(hca,
4561 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
4562 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
4563 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
4564 }
4565 
4566 static rdma_stat
4567 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
4568 	uint_t buflen, struct mrc *buf_handle)
4569 {
4570 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
4571 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
4572 	rdma_stat	status;
4573 
4574 
4575 	/*
4576 	 * Note: ALL buffer pools use the same memory type RDMARW.
4577 	 */
4578 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
4579 	if (status == RDMA_SUCCESS) {
4580 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
4581 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4582 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4583 	} else {
4584 		buf_handle->mrc_linfo = NULL;
4585 		buf_handle->mrc_lmr = 0;
4586 		buf_handle->mrc_rmr = 0;
4587 	}
4588 	return (status);
4589 }
4590 
4591 /* ARGSUSED */
4592 static rdma_stat
4593 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
4594     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
4595 {
4596 
4597 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
4598 	return (RDMA_SUCCESS);
4599 }
4600 
4601 /* ARGSUSED */
4602 static rdma_stat
4603 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
4604 {
4605 
4606 	(void) ibt_deregister_mr(hca->hca_hdl,
4607 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4608 	return (RDMA_SUCCESS);
4609 }
4610 
4611 /*
4612  * Check if the IP interface named by `lifrp' is RDMA-capable.
4613  */
4614 static boolean_t
4615 rpcib_rdma_capable_interface(struct lifreq *lifrp)
4616 {
4617 	char ifname[LIFNAMSIZ];
4618 	char *cp;
4619 
4620 	if (lifrp->lifr_type == IFT_IB)
4621 		return (B_TRUE);
4622 
4623 	/*
4624 	 * Strip off the logical interface portion before getting
4625 	 * intimate with the name.
4626 	 */
4627 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
4628 	if ((cp = strchr(ifname, ':')) != NULL)
4629 		*cp = '\0';
4630 
4631 	return (strcmp("lo0", ifname) == 0);
4632 }
4633 
4634 static int
4635 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
4636 {
4637 	vnode_t *kvp, *vp;
4638 	TIUSER  *tiptr;
4639 	struct  strioctl iocb;
4640 	k_sigset_t smask;
4641 	int	err = 0;
4642 
4643 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
4644 		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
4645 		    &tiptr, CRED()) == 0) {
4646 			vp = tiptr->fp->f_vnode;
4647 		} else {
4648 			VN_RELE(kvp);
4649 			return (EPROTO);
4650 		}
4651 	} else {
4652 		return (EPROTO);
4653 	}
4654 
4655 	iocb.ic_cmd = cmd;
4656 	iocb.ic_timout = 0;
4657 	iocb.ic_len = len;
4658 	iocb.ic_dp = (caddr_t)arg;
4659 	sigintr(&smask, 0);
4660 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4661 	sigunintr(&smask);
4662 	(void) t_kclose(tiptr, 0);
4663 	VN_RELE(kvp);
4664 	return (err);
4665 }
4666 
4667 /*
4668  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
4669  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
4670  */
4671 static int
4672 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
4673 {
4674 	int err;
4675 	struct lifnum lifn;
4676 
4677 	bzero(&lifn, sizeof (struct lifnum));
4678 	lifn.lifn_family = AF_UNSPEC;
4679 
4680 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
4681 	if (err != 0)
4682 		return (err);
4683 
4684 	/*
4685 	 * Pad the interface count to account for additional interfaces that
4686 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
4687 	 */
4688 	lifn.lifn_count += 4;
4689 
4690 	bzero(lifcp, sizeof (struct lifconf));
4691 	lifcp->lifc_family = AF_UNSPEC;
4692 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
4693 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
4694 
4695 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
4696 	if (err != 0) {
4697 		kmem_free(lifcp->lifc_buf, *bufsizep);
4698 		return (err);
4699 	}
4700 	return (0);
4701 }
4702 
4703 static boolean_t
4704 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
4705 {
4706 	uint_t i, nifs;
4707 	uint_t bufsize;
4708 	struct lifconf lifc;
4709 	struct lifreq *lifrp;
4710 	struct sockaddr_in *sinp;
4711 	struct sockaddr_in6 *sin6p;
4712 
4713 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
4714 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
4715 
4716 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
4717 		return (B_FALSE);
4718 
4719 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
4720 		kmem_free(lifc.lifc_buf, bufsize);
4721 		return (B_FALSE);
4722 	}
4723 
4724 	/*
4725 	 * Worst case is that all of the addresses are IB-capable and have
4726 	 * the same address family, so size our buffers accordingly.
4727 	 */
4728 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
4729 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
4730 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
4731 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
4732 
4733 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
4734 		if (!rpcib_rdma_capable_interface(lifrp))
4735 			continue;
4736 
4737 		if (lifrp->lifr_addr.ss_family == AF_INET) {
4738 			sinp = addrs4->ri_list;
4739 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
4740 			    sizeof (struct sockaddr_in));
4741 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
4742 			sin6p = addrs6->ri_list;
4743 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
4744 			    sizeof (struct sockaddr_in6));
4745 		}
4746 	}
4747 
4748 	kmem_free(lifc.lifc_buf, bufsize);
4749 	return (B_TRUE);
4750 }
4751 
4752 /* ARGSUSED */
4753 static int rpcib_cache_kstat_update(kstat_t *ksp, int rw) {
4754 
4755 	if (KSTAT_WRITE == rw) {
4756 		return (EACCES);
4757 	}
4758 	rpcib_kstat.cache_limit.value.ui64 =
4759 	    (uint64_t)cache_limit;
4760 	rpcib_kstat.cache_allocation.value.ui64 =
4761 	    (uint64_t)cache_allocation;
4762 	rpcib_kstat.cache_hits.value.ui64 =
4763 	    (uint64_t)cache_hits;
4764 	rpcib_kstat.cache_misses.value.ui64 =
4765 	    (uint64_t)cache_misses;
4766 	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
4767 	    (uint64_t)cache_misses_above_the_limit;
4768 	return (0);
4769 }
4770