xref: /illumos-gate/usr/src/uts/common/rpc/rpcib.c (revision 3d393ee6c37fa10ac512ed6d36109ad616dc7c1a)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developors:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * The rpcib plugin. Implements the interface for RDMATF's
42  * interaction with IBTF.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84 
85 #define	NFS_RDMA_PORT	2050
86 
87 /*
88  * Convenience structure used by rpcib_get_ib_addresses()
89  */
90 typedef struct rpcib_ipaddrs {
91 	void	*ri_list;	/* pointer to list of addresses */
92 	uint_t	ri_count;	/* number of addresses in list */
93 	uint_t	ri_size;	/* size of ri_list in bytes */
94 } rpcib_ipaddrs_t;
95 
96 /*
97  * Prototype declarations for driver ops
98  */
99 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
100 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
101 				void *, void **);
102 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
103 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
104 static int	rpcib_do_ip_ioctl(int, int, void *);
105 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
106 static int rpcib_cache_kstat_update(kstat_t *, int);
107 static void rib_force_cleanup(void *);
108 
109 struct {
110 	kstat_named_t cache_limit;
111 	kstat_named_t cache_allocation;
112 	kstat_named_t cache_hits;
113 	kstat_named_t cache_misses;
114 	kstat_named_t cache_misses_above_the_limit;
115 } rpcib_kstat = {
116 	{"cache_limit",			KSTAT_DATA_UINT64 },
117 	{"cache_allocation",		KSTAT_DATA_UINT64 },
118 	{"cache_hits",			KSTAT_DATA_UINT64 },
119 	{"cache_misses",		KSTAT_DATA_UINT64 },
120 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
121 };
122 
123 /* rpcib cb_ops */
124 static struct cb_ops rpcib_cbops = {
125 	nulldev,		/* open */
126 	nulldev,		/* close */
127 	nodev,			/* strategy */
128 	nodev,			/* print */
129 	nodev,			/* dump */
130 	nodev,			/* read */
131 	nodev,			/* write */
132 	nodev,			/* ioctl */
133 	nodev,			/* devmap */
134 	nodev,			/* mmap */
135 	nodev,			/* segmap */
136 	nochpoll,		/* poll */
137 	ddi_prop_op,		/* prop_op */
138 	NULL,			/* stream */
139 	D_MP,			/* cb_flag */
140 	CB_REV,			/* rev */
141 	nodev,			/* int (*cb_aread)() */
142 	nodev			/* int (*cb_awrite)() */
143 };
144 
145 /*
146  * Device options
147  */
148 static struct dev_ops rpcib_ops = {
149 	DEVO_REV,		/* devo_rev, */
150 	0,			/* refcnt  */
151 	rpcib_getinfo,		/* info */
152 	nulldev,		/* identify */
153 	nulldev,		/* probe */
154 	rpcib_attach,		/* attach */
155 	rpcib_detach,		/* detach */
156 	nodev,			/* reset */
157 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
158 	NULL,			/* bus operations */
159 	NULL,			/* power */
160 	ddi_quiesce_not_needed,		/* quiesce */
161 };
162 
163 /*
164  * Module linkage information.
165  */
166 
167 static struct modldrv rib_modldrv = {
168 	&mod_driverops,		/* Driver module */
169 	"RPCIB plugin driver",	/* Driver name and version */
170 	&rpcib_ops,		/* Driver ops */
171 };
172 
173 static struct modlinkage rib_modlinkage = {
174 	MODREV_1,
175 	(void *)&rib_modldrv,
176 	NULL
177 };
178 
179 typedef struct rib_lrc_entry {
180 	struct rib_lrc_entry *forw;
181 	struct rib_lrc_entry *back;
182 	char *lrc_buf;
183 
184 	uint32_t lrc_len;
185 	void  *avl_node;
186 	bool_t registered;
187 
188 	struct mrc lrc_mhandle;
189 	bool_t lrc_on_freed_list;
190 } rib_lrc_entry_t;
191 
192 typedef	struct cache_struct	{
193 	rib_lrc_entry_t		r;
194 	uint32_t		len;
195 	uint32_t		elements;
196 	kmutex_t		node_lock;
197 	avl_node_t		avl_link;
198 } cache_avl_struct_t;
199 
200 static uint64_t	rib_total_buffers = 0;
201 uint64_t	cache_limit = 100 * 1024 * 1024;
202 static volatile uint64_t	cache_allocation = 0;
203 static uint64_t	cache_watermark = 80 * 1024 * 1024;
204 static uint64_t	cache_hits = 0;
205 static uint64_t	cache_misses = 0;
206 static uint64_t	cache_cold_misses = 0;
207 static uint64_t	cache_hot_misses = 0;
208 static uint64_t	cache_misses_above_the_limit = 0;
209 static bool_t	stats_enabled = FALSE;
210 
211 static uint64_t max_unsignaled_rws = 5;
212 
213 /*
214  * rib_stat: private data pointer used when registering
215  *	with the IBTF.  It is returned to the consumer
216  *	in all callbacks.
217  */
218 static rpcib_state_t *rib_stat = NULL;
219 
220 #define	RNR_RETRIES	IBT_RNR_RETRY_1
221 #define	MAX_PORTS	2
222 
223 int preposted_rbufs = RDMA_BUFS_GRANT;
224 int send_threshold = 1;
225 
226 /*
227  * State of the plugin.
228  * ACCEPT = accepting new connections and requests.
229  * NO_ACCEPT = not accepting new connection and requests.
230  * This should eventually move to rpcib_state_t structure, since this
231  * will tell in which state the plugin is for a particular type of service
232  * like NFS, NLM or v4 Callback deamon. The plugin might be in accept
233  * state for one and in no_accept state for the other.
234  */
235 int		plugin_state;
236 kmutex_t	plugin_state_lock;
237 
238 ldi_ident_t rpcib_li;
239 
240 /*
241  * RPCIB RDMATF operations
242  */
243 #if defined(MEASURE_POOL_DEPTH)
244 static void rib_posted_rbufs(uint32_t x) { return; }
245 #endif
246 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
247 static rdma_stat rib_disconnect(CONN *conn);
248 static void rib_listen(struct rdma_svc_data *rd);
249 static void rib_listen_stop(struct rdma_svc_data *rd);
250 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
251 	uint_t buflen, struct mrc *buf_handle);
252 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
253 	struct mrc buf_handle);
254 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
255 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
256 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
257 		struct mrc buf_handle);
258 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
259 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
260 	void *lrc);
261 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
262 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
263 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
264 	caddr_t buf, int len, int cpu);
265 
266 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
267 
268 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
269 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
270 
271 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
272 
273 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
274 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
275 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
276 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
277 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
278 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
279 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
280 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
281 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
282 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
283 static rdma_stat rib_conn_release(CONN *conn);
284 static rdma_stat rib_getinfo(rdma_info_t *info);
285 
286 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
287 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
288 static void rib_destroy_cache(rib_hca_t *hca);
289 static	void	rib_server_side_cache_reclaim(void *argp);
290 static int avl_compare(const void *t1, const void *t2);
291 
292 static void rib_stop_services(rib_hca_t *);
293 static void rib_close_channels(rib_conn_list_t *);
294 
295 /*
296  * RPCIB addressing operations
297  */
298 
299 /*
300  * RDMA operations the RPCIB module exports
301  */
302 static rdmaops_t rib_ops = {
303 	rib_reachable,
304 	rib_conn_get,
305 	rib_conn_release,
306 	rib_listen,
307 	rib_listen_stop,
308 	rib_registermem,
309 	rib_deregistermem,
310 	rib_registermemsync,
311 	rib_deregistermemsync,
312 	rib_syncmem,
313 	rib_reg_buf_alloc,
314 	rib_reg_buf_free,
315 	rib_send,
316 	rib_send_resp,
317 	rib_post_resp,
318 	rib_post_resp_remove,
319 	rib_post_recv,
320 	rib_recv,
321 	rib_read,
322 	rib_write,
323 	rib_getinfo,
324 };
325 
326 /*
327  * RDMATF RPCIB plugin details
328  */
329 static rdma_mod_t rib_mod = {
330 	"ibtf",		/* api name */
331 	RDMATF_VERS_1,
332 	0,
333 	&rib_ops,	/* rdma op vector for ibtf */
334 };
335 
336 static rdma_stat open_hcas(rpcib_state_t *);
337 static rdma_stat rib_qp_init(rib_qp_t *, int);
338 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
339 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
340 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
341 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
342 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
343 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
344 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
345 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
346 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
347 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *,
348 	ibt_ip_addr_t *, ibt_ip_addr_t *);
349 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
350 	rib_qp_t **);
351 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
352 	rib_qp_t **);
353 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
354 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
355 static int rib_free_sendwait(struct send_wid *);
356 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
357 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
358 static void rdma_done_rem_list(rib_qp_t *);
359 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
360 
361 static void rib_async_handler(void *,
362 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
363 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
364 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
365 static int rib_free_svc_recv(struct svc_recv *);
366 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
367 static void rib_free_wid(struct recv_wid *);
368 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
369 static void rib_detach_hca(rib_hca_t *);
370 static rdma_stat rib_chk_srv_ibaddr(struct netbuf *, int,
371 	ibt_path_info_t *, ibt_ip_addr_t *, ibt_ip_addr_t *);
372 
373 /*
374  * Registration with IBTF as a consumer
375  */
376 static struct ibt_clnt_modinfo_s rib_modinfo = {
377 	IBTI_V_CURR,
378 	IBT_GENERIC,
379 	rib_async_handler,	/* async event handler */
380 	NULL,			/* Memory Region Handler */
381 	"nfs/ib"
382 };
383 
384 /*
385  * Global strucuture
386  */
387 
388 typedef struct rpcib_s {
389 	dev_info_t	*rpcib_dip;
390 	kmutex_t	rpcib_mutex;
391 } rpcib_t;
392 
393 rpcib_t rpcib;
394 
395 /*
396  * /etc/system controlled variable to control
397  * debugging in rpcib kernel module.
398  * Set it to values greater that 1 to control
399  * the amount of debugging messages required.
400  */
401 int rib_debug = 0;
402 
403 int
404 _init(void)
405 {
406 	int error;
407 
408 	error = mod_install((struct modlinkage *)&rib_modlinkage);
409 	if (error != 0) {
410 		/*
411 		 * Could not load module
412 		 */
413 		return (error);
414 	}
415 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
416 	return (0);
417 }
418 
419 int
420 _fini()
421 {
422 	int status;
423 
424 	/*
425 	 * Remove module
426 	 */
427 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
428 		return (status);
429 	}
430 	mutex_destroy(&plugin_state_lock);
431 	return (0);
432 }
433 
434 int
435 _info(struct modinfo *modinfop)
436 {
437 	return (mod_info(&rib_modlinkage, modinfop));
438 }
439 
440 /*
441  * rpcib_getinfo()
442  * Given the device number, return the devinfo pointer or the
443  * instance number.
444  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
445  */
446 
447 /*ARGSUSED*/
448 static int
449 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
450 {
451 	int ret = DDI_SUCCESS;
452 
453 	switch (cmd) {
454 	case DDI_INFO_DEVT2DEVINFO:
455 		if (rpcib.rpcib_dip != NULL)
456 			*result = rpcib.rpcib_dip;
457 		else {
458 			*result = NULL;
459 			ret = DDI_FAILURE;
460 		}
461 		break;
462 
463 	case DDI_INFO_DEVT2INSTANCE:
464 		*result = NULL;
465 		break;
466 
467 	default:
468 		ret = DDI_FAILURE;
469 	}
470 	return (ret);
471 }
472 
473 static int
474 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
475 {
476 	ibt_status_t	ibt_status;
477 	rdma_stat	r_status;
478 
479 	switch (cmd) {
480 	case DDI_ATTACH:
481 		break;
482 	case DDI_RESUME:
483 		return (DDI_SUCCESS);
484 	default:
485 		return (DDI_FAILURE);
486 	}
487 
488 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
489 
490 	mutex_enter(&rpcib.rpcib_mutex);
491 	if (rpcib.rpcib_dip != NULL) {
492 		mutex_exit(&rpcib.rpcib_mutex);
493 		return (DDI_FAILURE);
494 	}
495 	rpcib.rpcib_dip = dip;
496 	mutex_exit(&rpcib.rpcib_mutex);
497 	/*
498 	 * Create the "rpcib" minor-node.
499 	 */
500 	if (ddi_create_minor_node(dip,
501 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
502 		/* Error message, no cmn_err as they print on console */
503 		return (DDI_FAILURE);
504 	}
505 
506 	if (rib_stat == NULL) {
507 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
508 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
509 	}
510 
511 	rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
512 	if (rib_stat->hca_count < 1) {
513 		mutex_destroy(&rib_stat->open_hca_lock);
514 		kmem_free(rib_stat, sizeof (*rib_stat));
515 		rib_stat = NULL;
516 		return (DDI_FAILURE);
517 	}
518 
519 	ibt_status = ibt_attach(&rib_modinfo, dip,
520 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
521 
522 	if (ibt_status != IBT_SUCCESS) {
523 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
524 		mutex_destroy(&rib_stat->open_hca_lock);
525 		kmem_free(rib_stat, sizeof (*rib_stat));
526 		rib_stat = NULL;
527 		return (DDI_FAILURE);
528 	}
529 
530 	mutex_enter(&rib_stat->open_hca_lock);
531 	if (open_hcas(rib_stat) != RDMA_SUCCESS) {
532 		mutex_exit(&rib_stat->open_hca_lock);
533 		goto open_fail;
534 	}
535 	mutex_exit(&rib_stat->open_hca_lock);
536 
537 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
538 	    DDI_PROP_SUCCESS) {
539 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
540 		    "failed.");
541 		goto register_fail;
542 	}
543 
544 	/*
545 	 * Register with rdmatf
546 	 */
547 	rib_mod.rdma_count = rib_stat->nhca_inited;
548 	r_status = rdma_register_mod(&rib_mod);
549 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
550 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
551 		    "status = %d", r_status);
552 		goto register_fail;
553 	}
554 
555 	return (DDI_SUCCESS);
556 
557 register_fail:
558 	rib_detach_hca(rib_stat->hca);
559 open_fail:
560 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
561 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
562 	mutex_destroy(&rib_stat->open_hca_lock);
563 	kmem_free(rib_stat, sizeof (*rib_stat));
564 	rib_stat = NULL;
565 	return (DDI_FAILURE);
566 }
567 
568 /*ARGSUSED*/
569 static int
570 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
571 {
572 	switch (cmd) {
573 
574 	case DDI_DETACH:
575 		break;
576 
577 	case DDI_SUSPEND:
578 	default:
579 		return (DDI_FAILURE);
580 	}
581 
582 	/*
583 	 * Detach the hca and free resources
584 	 */
585 	mutex_enter(&plugin_state_lock);
586 	plugin_state = NO_ACCEPT;
587 	mutex_exit(&plugin_state_lock);
588 	rib_detach_hca(rib_stat->hca);
589 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
590 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
591 	mutex_destroy(&rib_stat->open_hca_lock);
592 	if (rib_stat->hcas) {
593 		kmem_free(rib_stat->hcas, rib_stat->hca_count *
594 		    sizeof (rib_hca_t));
595 		rib_stat->hcas = NULL;
596 	}
597 	kmem_free(rib_stat, sizeof (*rib_stat));
598 	rib_stat = NULL;
599 
600 	mutex_enter(&rpcib.rpcib_mutex);
601 	rpcib.rpcib_dip = NULL;
602 	mutex_exit(&rpcib.rpcib_mutex);
603 	mutex_destroy(&rpcib.rpcib_mutex);
604 	return (DDI_SUCCESS);
605 }
606 
607 
608 static void rib_rbufpool_free(rib_hca_t *, int);
609 static void rib_rbufpool_deregister(rib_hca_t *, int);
610 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
611 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
612 static rdma_stat rib_rem_replylist(rib_qp_t *);
613 static int rib_remreply(rib_qp_t *, struct reply *);
614 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
615 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
616 
617 
618 /*
619  * One CQ pair per HCA
620  */
621 static rdma_stat
622 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
623 	rib_cq_t **cqp, rpcib_state_t *ribstat)
624 {
625 	rib_cq_t	*cq;
626 	ibt_cq_attr_t	cq_attr;
627 	uint32_t	real_size;
628 	ibt_status_t	status;
629 	rdma_stat	error = RDMA_SUCCESS;
630 
631 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
632 	cq->rib_hca = hca;
633 	cq_attr.cq_size = cq_size;
634 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
635 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
636 	    &real_size);
637 	if (status != IBT_SUCCESS) {
638 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
639 		    " status=%d", status);
640 		error = RDMA_FAILED;
641 		goto fail;
642 	}
643 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
644 
645 	/*
646 	 * Enable CQ callbacks. CQ Callbacks are single shot
647 	 * (e.g. you have to call ibt_enable_cq_notify()
648 	 * after each callback to get another one).
649 	 */
650 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
651 	if (status != IBT_SUCCESS) {
652 		cmn_err(CE_WARN, "rib_create_cq: "
653 		    "enable_cq_notify failed, status %d", status);
654 		error = RDMA_FAILED;
655 		goto fail;
656 	}
657 	*cqp = cq;
658 
659 	return (error);
660 fail:
661 	if (cq->rib_cq_hdl)
662 		(void) ibt_free_cq(cq->rib_cq_hdl);
663 	if (cq)
664 		kmem_free(cq, sizeof (rib_cq_t));
665 	return (error);
666 }
667 
668 static rdma_stat
669 open_hcas(rpcib_state_t *ribstat)
670 {
671 	rib_hca_t		*hca;
672 	ibt_status_t		ibt_status;
673 	rdma_stat		status;
674 	ibt_hca_portinfo_t	*pinfop;
675 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
676 	uint_t			size, cq_size;
677 	int			i;
678 	kstat_t *ksp;
679 	cache_avl_struct_t example_avl_node;
680 	char rssc_name[32];
681 
682 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
683 
684 	if (ribstat->hcas == NULL)
685 		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
686 		    sizeof (rib_hca_t), KM_SLEEP);
687 
688 	/*
689 	 * Open a hca and setup for RDMA
690 	 */
691 	for (i = 0; i < ribstat->hca_count; i++) {
692 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
693 		    ribstat->hca_guids[i],
694 		    &ribstat->hcas[i].hca_hdl);
695 		if (ibt_status != IBT_SUCCESS) {
696 			continue;
697 		}
698 		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
699 		hca = &(ribstat->hcas[i]);
700 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
701 		hca->state = HCA_INITED;
702 
703 		/*
704 		 * query HCA info
705 		 */
706 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
707 		if (ibt_status != IBT_SUCCESS) {
708 			goto fail1;
709 		}
710 
711 		/*
712 		 * One PD (Protection Domain) per HCA.
713 		 * A qp is allowed to access a memory region
714 		 * only when it's in the same PD as that of
715 		 * the memory region.
716 		 */
717 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
718 		if (ibt_status != IBT_SUCCESS) {
719 			goto fail1;
720 		}
721 
722 		/*
723 		 * query HCA ports
724 		 */
725 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
726 		    0, &pinfop, &hca->hca_nports, &size);
727 		if (ibt_status != IBT_SUCCESS) {
728 			goto fail2;
729 		}
730 		hca->hca_ports = pinfop;
731 		hca->hca_pinfosz = size;
732 		pinfop = NULL;
733 
734 		cq_size = DEF_CQ_SIZE; /* default cq size */
735 		/*
736 		 * Create 2 pairs of cq's (1 pair for client
737 		 * and the other pair for server) on this hca.
738 		 * If number of qp's gets too large, then several
739 		 * cq's will be needed.
740 		 */
741 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
742 		    &hca->svc_rcq, ribstat);
743 		if (status != RDMA_SUCCESS) {
744 			goto fail3;
745 		}
746 
747 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
748 		    &hca->svc_scq, ribstat);
749 		if (status != RDMA_SUCCESS) {
750 			goto fail3;
751 		}
752 
753 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
754 		    &hca->clnt_rcq, ribstat);
755 		if (status != RDMA_SUCCESS) {
756 			goto fail3;
757 		}
758 
759 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
760 		    &hca->clnt_scq, ribstat);
761 		if (status != RDMA_SUCCESS) {
762 			goto fail3;
763 		}
764 
765 		/*
766 		 * Create buffer pools.
767 		 * Note rib_rbuf_create also allocates memory windows.
768 		 */
769 		hca->recv_pool = rib_rbufpool_create(hca,
770 		    RECV_BUFFER, MAX_BUFS);
771 		if (hca->recv_pool == NULL) {
772 			goto fail3;
773 		}
774 
775 		hca->send_pool = rib_rbufpool_create(hca,
776 		    SEND_BUFFER, MAX_BUFS);
777 		if (hca->send_pool == NULL) {
778 			rib_rbufpool_destroy(hca, RECV_BUFFER);
779 			goto fail3;
780 		}
781 
782 		if (hca->server_side_cache == NULL) {
783 			(void) sprintf(rssc_name,
784 			    "rib_server_side_cache_%04d", i);
785 			hca->server_side_cache = kmem_cache_create(
786 			    rssc_name,
787 			    sizeof (cache_avl_struct_t), 0,
788 			    NULL,
789 			    NULL,
790 			    rib_server_side_cache_reclaim,
791 			    hca, NULL, 0);
792 		}
793 
794 		avl_create(&hca->avl_tree,
795 		    avl_compare,
796 		    sizeof (cache_avl_struct_t),
797 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
798 		    (uint_t)(uintptr_t)&example_avl_node);
799 
800 		rw_init(&hca->avl_rw_lock,
801 		    NULL, RW_DRIVER, hca->iblock);
802 		mutex_init(&hca->cache_allocation,
803 		    NULL, MUTEX_DRIVER, NULL);
804 		hca->avl_init = TRUE;
805 
806 		/* Create kstats for the cache */
807 		ASSERT(INGLOBALZONE(curproc));
808 
809 		if (!stats_enabled) {
810 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
811 			    KSTAT_TYPE_NAMED,
812 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
813 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
814 			    GLOBAL_ZONEID);
815 			if (ksp) {
816 				ksp->ks_data = (void *) &rpcib_kstat;
817 				ksp->ks_update = rpcib_cache_kstat_update;
818 				kstat_install(ksp);
819 				stats_enabled = TRUE;
820 			}
821 		}
822 		if (NULL == hca->reg_cache_clean_up) {
823 			hca->reg_cache_clean_up = ddi_taskq_create(NULL,
824 			    "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0);
825 		}
826 
827 		/*
828 		 * Initialize the registered service list and
829 		 * the lock
830 		 */
831 		hca->service_list = NULL;
832 		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
833 
834 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
835 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
836 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
837 		    hca->iblock);
838 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
839 		    hca->iblock);
840 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
841 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
842 		hca->inuse = TRUE;
843 		/*
844 		 * XXX One hca only. Add multi-hca functionality if needed
845 		 * later.
846 		 */
847 		ribstat->hca = hca;
848 		ribstat->nhca_inited++;
849 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
850 		break;
851 
852 fail3:
853 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
854 fail2:
855 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
856 fail1:
857 		(void) ibt_close_hca(hca->hca_hdl);
858 
859 	}
860 	if (ribstat->hca != NULL)
861 		return (RDMA_SUCCESS);
862 	else
863 		return (RDMA_FAILED);
864 }
865 
866 /*
867  * Callback routines
868  */
869 
870 /*
871  * SCQ handlers
872  */
873 /* ARGSUSED */
874 static void
875 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
876 {
877 	ibt_status_t	ibt_status;
878 	ibt_wc_t	wc;
879 	int		i;
880 
881 	/*
882 	 * Re-enable cq notify here to avoid missing any
883 	 * completion queue notification.
884 	 */
885 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
886 
887 	ibt_status = IBT_SUCCESS;
888 	while (ibt_status != IBT_CQ_EMPTY) {
889 	bzero(&wc, sizeof (wc));
890 	ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
891 	if (ibt_status != IBT_SUCCESS)
892 		return;
893 
894 	/*
895 	 * Got a send completion
896 	 */
897 	if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
898 		struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
899 		CONN	*conn = qptoc(wd->qp);
900 
901 		mutex_enter(&wd->sendwait_lock);
902 		switch (wc.wc_status) {
903 		case IBT_WC_SUCCESS:
904 			wd->status = RDMA_SUCCESS;
905 			break;
906 		case IBT_WC_WR_FLUSHED_ERR:
907 			wd->status = RDMA_FAILED;
908 			break;
909 		default:
910 /*
911  *    RC Send Q Error Code		Local state     Remote State
912  *    ==================== 		===========     ============
913  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
914  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
915  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
916  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
917  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
918  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
919  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
920  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
921  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
922  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
923  *    IBT_WC_WR_FLUSHED_ERR               None            None
924  */
925 			/*
926 			 * Channel in error state. Set connection to
927 			 * ERROR and cleanup will happen either from
928 			 * conn_release  or from rib_conn_get
929 			 */
930 			wd->status = RDMA_FAILED;
931 			mutex_enter(&conn->c_lock);
932 			if (conn->c_state != C_DISCONN_PEND)
933 				conn->c_state = C_ERROR_CONN;
934 			mutex_exit(&conn->c_lock);
935 			break;
936 		}
937 
938 		if (wd->cv_sig == 1) {
939 			/*
940 			 * Notify poster
941 			 */
942 			cv_signal(&wd->wait_cv);
943 			mutex_exit(&wd->sendwait_lock);
944 		} else {
945 			/*
946 			 * Poster not waiting for notification.
947 			 * Free the send buffers and send_wid
948 			 */
949 			for (i = 0; i < wd->nsbufs; i++) {
950 				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
951 				    (void *)(uintptr_t)wd->sbufaddr[i]);
952 				}
953 			mutex_exit(&wd->sendwait_lock);
954 			(void) rib_free_sendwait(wd);
955 			}
956 		}
957 	}
958 }
959 
960 /* ARGSUSED */
961 static void
962 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
963 {
964 	ibt_status_t	ibt_status;
965 	ibt_wc_t	wc;
966 	int		i;
967 
968 	/*
969 	 * Re-enable cq notify here to avoid missing any
970 	 * completion queue notification.
971 	 */
972 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
973 
974 	ibt_status = IBT_SUCCESS;
975 	while (ibt_status != IBT_CQ_EMPTY) {
976 		bzero(&wc, sizeof (wc));
977 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
978 		if (ibt_status != IBT_SUCCESS)
979 			return;
980 
981 		/*
982 		 * Got a send completion
983 		 */
984 		if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
985 			struct send_wid *wd =
986 			    (struct send_wid *)(uintptr_t)wc.wc_id;
987 			mutex_enter(&wd->sendwait_lock);
988 			if (wd->cv_sig == 1) {
989 				/*
990 				 * Update completion status and notify poster
991 				 */
992 				if (wc.wc_status == IBT_WC_SUCCESS)
993 					wd->status = RDMA_SUCCESS;
994 				else
995 					wd->status = RDMA_FAILED;
996 				cv_signal(&wd->wait_cv);
997 				mutex_exit(&wd->sendwait_lock);
998 			} else {
999 				/*
1000 				 * Poster not waiting for notification.
1001 				 * Free the send buffers and send_wid
1002 				 */
1003 				for (i = 0; i < wd->nsbufs; i++) {
1004 					rib_rbuf_free(qptoc(wd->qp),
1005 					    SEND_BUFFER,
1006 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1007 				}
1008 				mutex_exit(&wd->sendwait_lock);
1009 				(void) rib_free_sendwait(wd);
1010 			}
1011 		}
1012 	}
1013 }
1014 
1015 /*
1016  * RCQ handler
1017  */
1018 /* ARGSUSED */
1019 static void
1020 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1021 {
1022 	rib_qp_t	*qp;
1023 	ibt_status_t	ibt_status;
1024 	ibt_wc_t	wc;
1025 	struct recv_wid	*rwid;
1026 
1027 	/*
1028 	 * Re-enable cq notify here to avoid missing any
1029 	 * completion queue notification.
1030 	 */
1031 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1032 
1033 	ibt_status = IBT_SUCCESS;
1034 	while (ibt_status != IBT_CQ_EMPTY) {
1035 		bzero(&wc, sizeof (wc));
1036 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1037 		if (ibt_status != IBT_SUCCESS)
1038 			return;
1039 
1040 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1041 		qp = rwid->qp;
1042 		if (wc.wc_status == IBT_WC_SUCCESS) {
1043 			XDR	inxdrs, *xdrs;
1044 			uint_t	xid, vers, op, find_xid = 0;
1045 			struct reply	*r;
1046 			CONN *conn = qptoc(qp);
1047 			uint32_t rdma_credit = 0;
1048 
1049 			xdrs = &inxdrs;
1050 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1051 			    wc.wc_bytes_xfer, XDR_DECODE);
1052 			/*
1053 			 * Treat xid as opaque (xid is the first entity
1054 			 * in the rpc rdma message).
1055 			 */
1056 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1057 
1058 			/* Skip xid and set the xdr position accordingly. */
1059 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1060 			(void) xdr_u_int(xdrs, &vers);
1061 			(void) xdr_u_int(xdrs, &rdma_credit);
1062 			(void) xdr_u_int(xdrs, &op);
1063 			XDR_DESTROY(xdrs);
1064 
1065 			if (vers != RPCRDMA_VERS) {
1066 				/*
1067 				 * Invalid RPC/RDMA version. Cannot
1068 				 * interoperate.  Set connection to
1069 				 * ERROR state and bail out.
1070 				 */
1071 				mutex_enter(&conn->c_lock);
1072 				if (conn->c_state != C_DISCONN_PEND)
1073 					conn->c_state = C_ERROR_CONN;
1074 				mutex_exit(&conn->c_lock);
1075 				rib_rbuf_free(conn, RECV_BUFFER,
1076 				    (void *)(uintptr_t)rwid->addr);
1077 				rib_free_wid(rwid);
1078 				continue;
1079 			}
1080 
1081 			mutex_enter(&qp->replylist_lock);
1082 			for (r = qp->replylist; r != NULL; r = r->next) {
1083 				if (r->xid == xid) {
1084 					find_xid = 1;
1085 					switch (op) {
1086 					case RDMA_MSG:
1087 					case RDMA_NOMSG:
1088 					case RDMA_MSGP:
1089 						r->status = RDMA_SUCCESS;
1090 						r->vaddr_cq = rwid->addr;
1091 						r->bytes_xfer =
1092 						    wc.wc_bytes_xfer;
1093 						cv_signal(&r->wait_cv);
1094 						break;
1095 					default:
1096 						rib_rbuf_free(qptoc(qp),
1097 						    RECV_BUFFER,
1098 						    (void *)(uintptr_t)
1099 						    rwid->addr);
1100 						break;
1101 					}
1102 					break;
1103 				}
1104 			}
1105 			mutex_exit(&qp->replylist_lock);
1106 			if (find_xid == 0) {
1107 				/* RPC caller not waiting for reply */
1108 
1109 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1110 				    int, xid);
1111 
1112 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1113 				    (void *)(uintptr_t)rwid->addr);
1114 			}
1115 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1116 			CONN *conn = qptoc(qp);
1117 
1118 			/*
1119 			 * Connection being flushed. Just free
1120 			 * the posted buffer
1121 			 */
1122 			rib_rbuf_free(conn, RECV_BUFFER,
1123 			    (void *)(uintptr_t)rwid->addr);
1124 		} else {
1125 			CONN *conn = qptoc(qp);
1126 /*
1127  *  RC Recv Q Error Code		Local state     Remote State
1128  *  ====================		===========     ============
1129  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1130  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1131  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1132  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1133  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1134  *  IBT_WC_WR_FLUSHED_ERR               None            None
1135  */
1136 			/*
1137 			 * Channel in error state. Set connection
1138 			 * in ERROR state.
1139 			 */
1140 			mutex_enter(&conn->c_lock);
1141 			if (conn->c_state != C_DISCONN_PEND)
1142 				conn->c_state = C_ERROR_CONN;
1143 			mutex_exit(&conn->c_lock);
1144 			rib_rbuf_free(conn, RECV_BUFFER,
1145 			    (void *)(uintptr_t)rwid->addr);
1146 		}
1147 		rib_free_wid(rwid);
1148 	}
1149 }
1150 
1151 /* Server side */
1152 /* ARGSUSED */
1153 static void
1154 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1155 {
1156 	rdma_recv_data_t *rdp;
1157 	rib_qp_t	*qp;
1158 	ibt_status_t	ibt_status;
1159 	ibt_wc_t	wc;
1160 	struct svc_recv	*s_recvp;
1161 	CONN		*conn;
1162 	mblk_t		*mp;
1163 
1164 	/*
1165 	 * Re-enable cq notify here to avoid missing any
1166 	 * completion queue notification.
1167 	 */
1168 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1169 
1170 	ibt_status = IBT_SUCCESS;
1171 	while (ibt_status != IBT_CQ_EMPTY) {
1172 		bzero(&wc, sizeof (wc));
1173 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1174 		if (ibt_status != IBT_SUCCESS)
1175 			return;
1176 
1177 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1178 		qp = s_recvp->qp;
1179 		conn = qptoc(qp);
1180 		mutex_enter(&qp->posted_rbufs_lock);
1181 		qp->n_posted_rbufs--;
1182 #if defined(MEASURE_POOL_DEPTH)
1183 		rib_posted_rbufs(preposted_rbufs -  qp->n_posted_rbufs);
1184 #endif
1185 		if (qp->n_posted_rbufs == 0)
1186 			cv_signal(&qp->posted_rbufs_cv);
1187 		mutex_exit(&qp->posted_rbufs_lock);
1188 
1189 		if (wc.wc_status == IBT_WC_SUCCESS) {
1190 			XDR	inxdrs, *xdrs;
1191 			uint_t	xid, vers, op;
1192 			uint32_t rdma_credit;
1193 
1194 			xdrs = &inxdrs;
1195 			/* s_recvp->vaddr stores data */
1196 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1197 			    wc.wc_bytes_xfer, XDR_DECODE);
1198 
1199 			/*
1200 			 * Treat xid as opaque (xid is the first entity
1201 			 * in the rpc rdma message).
1202 			 */
1203 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1204 			/* Skip xid and set the xdr position accordingly. */
1205 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1206 			if (!xdr_u_int(xdrs, &vers) ||
1207 			    !xdr_u_int(xdrs, &rdma_credit) ||
1208 			    !xdr_u_int(xdrs, &op)) {
1209 				rib_rbuf_free(conn, RECV_BUFFER,
1210 				    (void *)(uintptr_t)s_recvp->vaddr);
1211 				XDR_DESTROY(xdrs);
1212 				(void) rib_free_svc_recv(s_recvp);
1213 				continue;
1214 			}
1215 			XDR_DESTROY(xdrs);
1216 
1217 			if (vers != RPCRDMA_VERS) {
1218 				/*
1219 				 * Invalid RPC/RDMA version.
1220 				 * Drop rpc rdma message.
1221 				 */
1222 				rib_rbuf_free(conn, RECV_BUFFER,
1223 				    (void *)(uintptr_t)s_recvp->vaddr);
1224 				(void) rib_free_svc_recv(s_recvp);
1225 				continue;
1226 			}
1227 			/*
1228 			 * Is this for RDMA_DONE?
1229 			 */
1230 			if (op == RDMA_DONE) {
1231 				rib_rbuf_free(conn, RECV_BUFFER,
1232 				    (void *)(uintptr_t)s_recvp->vaddr);
1233 				/*
1234 				 * Wake up the thread waiting on
1235 				 * a RDMA_DONE for xid
1236 				 */
1237 				mutex_enter(&qp->rdlist_lock);
1238 				rdma_done_notify(qp, xid);
1239 				mutex_exit(&qp->rdlist_lock);
1240 				(void) rib_free_svc_recv(s_recvp);
1241 				continue;
1242 			}
1243 
1244 			mutex_enter(&plugin_state_lock);
1245 			if (plugin_state == ACCEPT) {
1246 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1247 				    == NULL)
1248 					(void) strwaitbuf(
1249 					    sizeof (*rdp), BPRI_LO);
1250 				/*
1251 				 * Plugin is in accept state, hence the master
1252 				 * transport queue for this is still accepting
1253 				 * requests. Hence we can call svc_queuereq to
1254 				 * queue this recieved msg.
1255 				 */
1256 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1257 				rdp->conn = conn;
1258 				rdp->rpcmsg.addr =
1259 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1260 				rdp->rpcmsg.type = RECV_BUFFER;
1261 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1262 				rdp->status = wc.wc_status;
1263 				mutex_enter(&conn->c_lock);
1264 				conn->c_ref++;
1265 				mutex_exit(&conn->c_lock);
1266 				mp->b_wptr += sizeof (*rdp);
1267 				svc_queuereq((queue_t *)rib_stat->q, mp);
1268 				mutex_exit(&plugin_state_lock);
1269 			} else {
1270 				/*
1271 				 * The master transport for this is going
1272 				 * away and the queue is not accepting anymore
1273 				 * requests for krpc, so don't do anything, just
1274 				 * free the msg.
1275 				 */
1276 				mutex_exit(&plugin_state_lock);
1277 				rib_rbuf_free(conn, RECV_BUFFER,
1278 				    (void *)(uintptr_t)s_recvp->vaddr);
1279 			}
1280 		} else {
1281 			rib_rbuf_free(conn, RECV_BUFFER,
1282 			    (void *)(uintptr_t)s_recvp->vaddr);
1283 		}
1284 		(void) rib_free_svc_recv(s_recvp);
1285 	}
1286 }
1287 
1288 /*
1289  * Handles DR event of IBT_HCA_DETACH_EVENT.
1290  */
1291 /* ARGSUSED */
1292 static void
1293 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1294 	ibt_async_code_t code, ibt_async_event_t *event)
1295 {
1296 
1297 	switch (code) {
1298 	case IBT_HCA_ATTACH_EVENT:
1299 		/* ignore */
1300 		break;
1301 	case IBT_HCA_DETACH_EVENT:
1302 	{
1303 		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1304 		rib_detach_hca(rib_stat->hca);
1305 #ifdef DEBUG
1306 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1307 #endif
1308 		break;
1309 	}
1310 #ifdef DEBUG
1311 	case IBT_EVENT_PATH_MIGRATED:
1312 		cmn_err(CE_NOTE, "rib_async_handler(): "
1313 		    "IBT_EVENT_PATH_MIGRATED\n");
1314 		break;
1315 	case IBT_EVENT_SQD:
1316 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1317 		break;
1318 	case IBT_EVENT_COM_EST:
1319 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1320 		break;
1321 	case IBT_ERROR_CATASTROPHIC_CHAN:
1322 		cmn_err(CE_NOTE, "rib_async_handler(): "
1323 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1324 		break;
1325 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1326 		cmn_err(CE_NOTE, "rib_async_handler(): "
1327 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1328 		break;
1329 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1330 		cmn_err(CE_NOTE, "rib_async_handler(): "
1331 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1332 		break;
1333 	case IBT_ERROR_PATH_MIGRATE_REQ:
1334 		cmn_err(CE_NOTE, "rib_async_handler(): "
1335 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1336 		break;
1337 	case IBT_ERROR_CQ:
1338 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1339 		break;
1340 	case IBT_ERROR_PORT_DOWN:
1341 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1342 		break;
1343 	case IBT_EVENT_PORT_UP:
1344 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1345 		break;
1346 	case IBT_ASYNC_OPAQUE1:
1347 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1348 		break;
1349 	case IBT_ASYNC_OPAQUE2:
1350 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1351 		break;
1352 	case IBT_ASYNC_OPAQUE3:
1353 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1354 		break;
1355 	case IBT_ASYNC_OPAQUE4:
1356 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1357 		break;
1358 #endif
1359 	default:
1360 		break;
1361 	}
1362 }
1363 
1364 /*
1365  * Client's reachable function.
1366  */
1367 static rdma_stat
1368 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1369 {
1370 	rib_hca_t	*hca;
1371 	rdma_stat	status;
1372 
1373 	/*
1374 	 * First check if a hca is still attached
1375 	 */
1376 	*handle = NULL;
1377 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
1378 	if (rib_stat->hca->state != HCA_INITED) {
1379 		rw_exit(&rib_stat->hca->state_lock);
1380 		return (RDMA_FAILED);
1381 	}
1382 	status = rib_ping_srv(addr_type, raddr, &hca);
1383 	rw_exit(&rib_stat->hca->state_lock);
1384 
1385 	if (status == RDMA_SUCCESS) {
1386 		*handle = (void *)hca;
1387 		return (RDMA_SUCCESS);
1388 	} else {
1389 		*handle = NULL;
1390 		DTRACE_PROBE(rpcib__i__pingfailed);
1391 		return (RDMA_FAILED);
1392 	}
1393 }
1394 
1395 /* Client side qp creation */
1396 static rdma_stat
1397 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1398 {
1399 	rib_qp_t	*kqp = NULL;
1400 	CONN		*conn;
1401 	rdma_clnt_cred_ctrl_t *cc_info;
1402 
1403 	ASSERT(qp != NULL);
1404 	*qp = NULL;
1405 
1406 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1407 	conn = qptoc(kqp);
1408 	kqp->hca = hca;
1409 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1410 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1411 
1412 	kqp->mode = RIB_CLIENT;
1413 	kqp->chan_flags = IBT_BLOCKING;
1414 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1415 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1416 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1417 	/*
1418 	 * Initialize
1419 	 */
1420 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1421 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1422 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1423 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1424 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1425 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1426 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1427 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1428 	/*
1429 	 * Initialize the client credit control
1430 	 * portion of the rdmaconn struct.
1431 	 */
1432 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1433 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1434 	cc_info->clnt_cc_granted_ops = 0;
1435 	cc_info->clnt_cc_in_flight_ops = 0;
1436 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1437 
1438 	*qp = kqp;
1439 	return (RDMA_SUCCESS);
1440 }
1441 
1442 /* Server side qp creation */
1443 static rdma_stat
1444 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1445 {
1446 	rib_qp_t	*kqp = NULL;
1447 	ibt_chan_sizes_t	chan_sizes;
1448 	ibt_rc_chan_alloc_args_t	qp_attr;
1449 	ibt_status_t		ibt_status;
1450 	rdma_srv_cred_ctrl_t *cc_info;
1451 
1452 	*qp = NULL;
1453 
1454 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1455 	kqp->hca = hca;
1456 	kqp->port_num = port;
1457 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1458 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1459 
1460 	/*
1461 	 * Create the qp handle
1462 	 */
1463 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1464 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1465 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1466 	qp_attr.rc_pd = hca->pd_hdl;
1467 	qp_attr.rc_hca_port_num = port;
1468 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1469 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1470 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1471 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1472 	qp_attr.rc_clone_chan = NULL;
1473 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1474 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1475 
1476 	rw_enter(&hca->state_lock, RW_READER);
1477 	if (hca->state != HCA_DETACHED) {
1478 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1479 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1480 		    &chan_sizes);
1481 	} else {
1482 		rw_exit(&hca->state_lock);
1483 		goto fail;
1484 	}
1485 	rw_exit(&hca->state_lock);
1486 
1487 	if (ibt_status != IBT_SUCCESS) {
1488 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1489 		    int, ibt_status);
1490 		goto fail;
1491 	}
1492 
1493 	kqp->mode = RIB_SERVER;
1494 	kqp->chan_flags = IBT_BLOCKING;
1495 	kqp->q = q;	/* server ONLY */
1496 
1497 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1498 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1499 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1500 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1501 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1502 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1503 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1504 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1505 	/*
1506 	 * Set the private data area to qp to be used in callbacks
1507 	 */
1508 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1509 	kqp->rdmaconn.c_state = C_CONNECTED;
1510 
1511 	/*
1512 	 * Initialize the server credit control
1513 	 * portion of the rdmaconn struct.
1514 	 */
1515 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1516 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1517 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1518 	cc_info->srv_cc_cur_buffers_used = 0;
1519 	cc_info->srv_cc_posted = preposted_rbufs;
1520 
1521 	*qp = kqp;
1522 
1523 	return (RDMA_SUCCESS);
1524 fail:
1525 	if (kqp)
1526 		kmem_free(kqp, sizeof (rib_qp_t));
1527 
1528 	return (RDMA_FAILED);
1529 }
1530 
1531 /* ARGSUSED */
1532 ibt_cm_status_t
1533 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1534     ibt_cm_return_args_t *ret_args, void *priv_data,
1535     ibt_priv_data_len_t len)
1536 {
1537 	rpcib_state_t   *ribstat;
1538 	rib_hca_t	*hca;
1539 
1540 	ribstat = (rpcib_state_t *)clnt_hdl;
1541 	hca = (rib_hca_t *)ribstat->hca;
1542 
1543 	switch (event->cm_type) {
1544 
1545 	/* got a connection close event */
1546 	case IBT_CM_EVENT_CONN_CLOSED:
1547 	{
1548 		CONN	*conn;
1549 		rib_qp_t *qp;
1550 
1551 		/* check reason why connection was closed */
1552 		switch (event->cm_event.closed) {
1553 		case IBT_CM_CLOSED_DREP_RCVD:
1554 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1555 		case IBT_CM_CLOSED_DUP:
1556 		case IBT_CM_CLOSED_ABORT:
1557 		case IBT_CM_CLOSED_ALREADY:
1558 			/*
1559 			 * These cases indicate the local end initiated
1560 			 * the closing of the channel. Nothing to do here.
1561 			 */
1562 			break;
1563 		default:
1564 			/*
1565 			 * Reason for CONN_CLOSED event must be one of
1566 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1567 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1568 			 * the remote end is closing the channel. In these
1569 			 * cases free the channel and transition to error
1570 			 * state
1571 			 */
1572 			qp = ibt_get_chan_private(event->cm_channel);
1573 			conn = qptoc(qp);
1574 			mutex_enter(&conn->c_lock);
1575 			if (conn->c_state == C_DISCONN_PEND) {
1576 				mutex_exit(&conn->c_lock);
1577 				break;
1578 			}
1579 
1580 			conn->c_state = C_ERROR_CONN;
1581 
1582 			/*
1583 			 * Free the rc_channel. Channel has already
1584 			 * transitioned to ERROR state and WRs have been
1585 			 * FLUSHED_ERR already.
1586 			 */
1587 			(void) ibt_free_channel(qp->qp_hdl);
1588 			qp->qp_hdl = NULL;
1589 
1590 			/*
1591 			 * Free the conn if c_ref is down to 0 already
1592 			 */
1593 			if (conn->c_ref == 0) {
1594 				/*
1595 				 * Remove from list and free conn
1596 				 */
1597 				conn->c_state = C_DISCONN_PEND;
1598 				mutex_exit(&conn->c_lock);
1599 				(void) rib_disconnect_channel(conn,
1600 				    &hca->cl_conn_list);
1601 			} else {
1602 				mutex_exit(&conn->c_lock);
1603 			}
1604 #ifdef DEBUG
1605 			if (rib_debug)
1606 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1607 				    "(CONN_CLOSED) channel disconnected");
1608 #endif
1609 			break;
1610 		}
1611 		break;
1612 	}
1613 	default:
1614 		break;
1615 	}
1616 	return (IBT_CM_ACCEPT);
1617 }
1618 
1619 /* Check server ib address */
1620 rdma_stat
1621 rib_chk_srv_ibaddr(struct netbuf *raddr,
1622 	int addr_type, ibt_path_info_t *path, ibt_ip_addr_t *s_ip,
1623 	ibt_ip_addr_t *d_ip)
1624 {
1625 	struct sockaddr_in	*sin4;
1626 	struct sockaddr_in6	*sin6;
1627 	ibt_status_t		ibt_status;
1628 	ibt_ip_path_attr_t	ipattr;
1629 	uint8_t npaths = 0;
1630 	ibt_path_ip_src_t	srcip;
1631 
1632 	ASSERT(raddr->buf != NULL);
1633 
1634 	(void) bzero(path, sizeof (ibt_path_info_t));
1635 
1636 	switch (addr_type) {
1637 	case AF_INET:
1638 		sin4 = (struct sockaddr_in *)raddr->buf;
1639 		d_ip->family = AF_INET;
1640 		d_ip->un.ip4addr = sin4->sin_addr.s_addr;
1641 		break;
1642 
1643 	case AF_INET6:
1644 		sin6 = (struct sockaddr_in6 *)raddr->buf;
1645 		d_ip->family = AF_INET6;
1646 		d_ip->un.ip6addr = sin6->sin6_addr;
1647 		break;
1648 
1649 	default:
1650 		return (RDMA_INVAL);
1651 	}
1652 
1653 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1654 	bzero(&srcip, sizeof (ibt_path_ip_src_t));
1655 
1656 	ipattr.ipa_dst_ip 	= d_ip;
1657 	ipattr.ipa_hca_guid 	= rib_stat->hca->hca_guid;
1658 	ipattr.ipa_ndst		= 1;
1659 	ipattr.ipa_max_paths	= 1;
1660 	npaths = 0;
1661 
1662 	ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1663 	    IBT_PATH_NO_FLAGS,
1664 	    &ipattr,
1665 	    path,
1666 	    &npaths,
1667 	    &srcip);
1668 
1669 	if (ibt_status != IBT_SUCCESS ||
1670 	    npaths < 1 ||
1671 	    path->pi_hca_guid != rib_stat->hca->hca_guid) {
1672 
1673 		bzero(s_ip, sizeof (ibt_path_ip_src_t));
1674 		return (RDMA_FAILED);
1675 	}
1676 
1677 	if (srcip.ip_primary.family == AF_INET) {
1678 		s_ip->family = AF_INET;
1679 		s_ip->un.ip4addr = srcip.ip_primary.un.ip4addr;
1680 	} else {
1681 		s_ip->family = AF_INET6;
1682 		s_ip->un.ip6addr = srcip.ip_primary.un.ip6addr;
1683 	}
1684 
1685 	return (RDMA_SUCCESS);
1686 }
1687 
1688 
1689 /*
1690  * Connect to the server.
1691  */
1692 rdma_stat
1693 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path,
1694 		ibt_ip_addr_t *s_ip, ibt_ip_addr_t *d_ip)
1695 {
1696 	ibt_chan_open_args_t	chan_args;	/* channel args */
1697 	ibt_chan_sizes_t	chan_sizes;
1698 	ibt_rc_chan_alloc_args_t	qp_attr;
1699 	ibt_status_t		ibt_status;
1700 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1701 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1702 	ibt_ip_cm_info_t	ipcm_info;
1703 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1704 
1705 
1706 	(void) bzero(&chan_args, sizeof (chan_args));
1707 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1708 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1709 
1710 	switch (ipcm_info.src_addr.family = s_ip->family) {
1711 	case AF_INET:
1712 		ipcm_info.src_addr.un.ip4addr = s_ip->un.ip4addr;
1713 		break;
1714 	case AF_INET6:
1715 		ipcm_info.src_addr.un.ip6addr = s_ip->un.ip6addr;
1716 		break;
1717 	}
1718 
1719 	switch (ipcm_info.dst_addr.family = d_ip->family) {
1720 	case AF_INET:
1721 		ipcm_info.dst_addr.un.ip4addr = d_ip->un.ip4addr;
1722 		break;
1723 	case AF_INET6:
1724 		ipcm_info.dst_addr.un.ip6addr = d_ip->un.ip6addr;
1725 		break;
1726 	}
1727 
1728 	ipcm_info.src_port = NFS_RDMA_PORT;
1729 
1730 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1731 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1732 
1733 	if (ibt_status != IBT_SUCCESS) {
1734 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1735 		return (-1);
1736 	}
1737 
1738 	qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
1739 	/* Alloc a RC channel */
1740 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1741 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1742 	qp_attr.rc_pd = hca->pd_hdl;
1743 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1744 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1745 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1746 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1747 	qp_attr.rc_clone_chan = NULL;
1748 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1749 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1750 
1751 	path->pi_sid = ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT);
1752 	chan_args.oc_path = path;
1753 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1754 	chan_args.oc_cm_clnt_private = (void *)rib_stat;
1755 	chan_args.oc_rdma_ra_out = 4;
1756 	chan_args.oc_rdma_ra_in = 4;
1757 	chan_args.oc_path_retry_cnt = 2;
1758 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1759 	chan_args.oc_priv_data = cmp_ip_pvt;
1760 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1761 
1762 refresh:
1763 	rw_enter(&hca->state_lock, RW_READER);
1764 	if (hca->state != HCA_DETACHED) {
1765 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1766 		    IBT_ACHAN_NO_FLAGS,
1767 		    &qp_attr, &qp->qp_hdl,
1768 		    &chan_sizes);
1769 	} else {
1770 		rw_exit(&hca->state_lock);
1771 		return (RDMA_FAILED);
1772 	}
1773 	rw_exit(&hca->state_lock);
1774 
1775 	if (ibt_status != IBT_SUCCESS) {
1776 		DTRACE_PROBE1(rpcib__i_conntosrv,
1777 		    int, ibt_status);
1778 		return (RDMA_FAILED);
1779 	}
1780 
1781 	/* Connect to the Server */
1782 	(void) bzero(&ret_args, sizeof (ret_args));
1783 	mutex_enter(&qp->cb_lock);
1784 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1785 	    IBT_BLOCKING, &chan_args, &ret_args);
1786 	if (ibt_status != IBT_SUCCESS) {
1787 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1788 		    int, ibt_status, int, ret_args.rc_status);
1789 
1790 		(void) ibt_free_channel(qp->qp_hdl);
1791 		qp->qp_hdl = NULL;
1792 		mutex_exit(&qp->cb_lock);
1793 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1794 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1795 			/*
1796 			 * Got IBT_CM_CONN_STALE probably because of stale
1797 			 * data on the passive end of a channel that existed
1798 			 * prior to reboot. Retry establishing a channel
1799 			 * REFRESH_ATTEMPTS times, during which time the
1800 			 * stale conditions on the server might clear up.
1801 			 */
1802 			goto refresh;
1803 		}
1804 		return (RDMA_FAILED);
1805 	}
1806 	mutex_exit(&qp->cb_lock);
1807 	/*
1808 	 * Set the private data area to qp to be used in callbacks
1809 	 */
1810 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1811 	return (RDMA_SUCCESS);
1812 }
1813 
1814 rdma_stat
1815 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
1816 {
1817 	uint_t			i;
1818 	ibt_path_info_t		path;
1819 	ibt_status_t		ibt_status;
1820 	uint8_t			num_paths_p;
1821 	ibt_ip_path_attr_t	ipattr;
1822 	ibt_ip_addr_t		dstip;
1823 	ibt_path_ip_src_t	srcip;
1824 	rpcib_ipaddrs_t		addrs4;
1825 	rpcib_ipaddrs_t		addrs6;
1826 	struct sockaddr_in	*sinp;
1827 	struct sockaddr_in6	*sin6p;
1828 	rdma_stat		retval = RDMA_SUCCESS;
1829 
1830 	*hca = NULL;
1831 	ASSERT(raddr->buf != NULL);
1832 
1833 	bzero(&path, sizeof (ibt_path_info_t));
1834 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1835 	bzero(&srcip, sizeof (ibt_path_ip_src_t));
1836 
1837 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1838 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1839 		retval = RDMA_FAILED;
1840 		goto done;
1841 	}
1842 
1843 	/* Prep the destination address */
1844 	switch (addr_type) {
1845 	case AF_INET:
1846 		sinp = (struct sockaddr_in *)raddr->buf;
1847 		dstip.family = AF_INET;
1848 		dstip.un.ip4addr = sinp->sin_addr.s_addr;
1849 		sinp = addrs4.ri_list;
1850 
1851 		for (i = 0; i < addrs4.ri_count; i++) {
1852 			num_paths_p = 0;
1853 			ipattr.ipa_dst_ip 	= &dstip;
1854 			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1855 			ipattr.ipa_ndst		= 1;
1856 			ipattr.ipa_max_paths	= 1;
1857 			ipattr.ipa_src_ip.family = dstip.family;
1858 			ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr;
1859 
1860 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1861 			    IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p,
1862 			    &srcip);
1863 			if (ibt_status == IBT_SUCCESS &&
1864 			    num_paths_p != 0 &&
1865 			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
1866 				*hca = rib_stat->hca;
1867 				goto done;
1868 			}
1869 		}
1870 		retval = RDMA_FAILED;
1871 		break;
1872 
1873 	case AF_INET6:
1874 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1875 		dstip.family = AF_INET6;
1876 		dstip.un.ip6addr = sin6p->sin6_addr;
1877 		sin6p = addrs6.ri_list;
1878 
1879 		for (i = 0; i < addrs6.ri_count; i++) {
1880 			num_paths_p = 0;
1881 			ipattr.ipa_dst_ip 	= &dstip;
1882 			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1883 			ipattr.ipa_ndst		= 1;
1884 			ipattr.ipa_max_paths	= 1;
1885 			ipattr.ipa_src_ip.family = dstip.family;
1886 			ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr;
1887 
1888 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1889 			    IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p,
1890 			    &srcip);
1891 			if (ibt_status == IBT_SUCCESS &&
1892 			    num_paths_p != 0 &&
1893 			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
1894 				*hca = rib_stat->hca;
1895 				goto done;
1896 			}
1897 		}
1898 		retval = RDMA_FAILED;
1899 		break;
1900 
1901 	default:
1902 		retval = RDMA_INVAL;
1903 		break;
1904 	}
1905 done:
1906 	if (addrs4.ri_size > 0)
1907 		kmem_free(addrs4.ri_list, addrs4.ri_size);
1908 	if (addrs6.ri_size > 0)
1909 		kmem_free(addrs6.ri_list, addrs6.ri_size);
1910 	return (retval);
1911 }
1912 
1913 /*
1914  * Close channel, remove from connection list and
1915  * free up resources allocated for that channel.
1916  */
1917 rdma_stat
1918 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
1919 {
1920 	rib_qp_t	*qp = ctoqp(conn);
1921 	rib_hca_t	*hca;
1922 
1923 	/*
1924 	 * c_ref == 0 and connection is in C_DISCONN_PEND
1925 	 */
1926 	hca = qp->hca;
1927 	if (conn_list != NULL)
1928 		(void) rib_rm_conn(conn, conn_list);
1929 
1930 	if (qp->qp_hdl != NULL) {
1931 		/*
1932 		 * If the channel has not been establised,
1933 		 * ibt_flush_channel is called to flush outstanding WRs
1934 		 * on the Qs.  Otherwise, ibt_close_rc_channel() is
1935 		 * called.  The channel is then freed.
1936 		 */
1937 		if (conn_list != NULL)
1938 			(void) ibt_close_rc_channel(qp->qp_hdl,
1939 			    IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
1940 		else
1941 			(void) ibt_flush_channel(qp->qp_hdl);
1942 
1943 		mutex_enter(&qp->posted_rbufs_lock);
1944 		while (qp->n_posted_rbufs)
1945 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
1946 		mutex_exit(&qp->posted_rbufs_lock);
1947 		(void) ibt_free_channel(qp->qp_hdl);
1948 		qp->qp_hdl = NULL;
1949 	}
1950 
1951 	ASSERT(qp->rdlist == NULL);
1952 
1953 	if (qp->replylist != NULL) {
1954 		(void) rib_rem_replylist(qp);
1955 	}
1956 
1957 	cv_destroy(&qp->cb_conn_cv);
1958 	cv_destroy(&qp->posted_rbufs_cv);
1959 	mutex_destroy(&qp->cb_lock);
1960 
1961 	mutex_destroy(&qp->replylist_lock);
1962 	mutex_destroy(&qp->posted_rbufs_lock);
1963 	mutex_destroy(&qp->rdlist_lock);
1964 
1965 	cv_destroy(&conn->c_cv);
1966 	mutex_destroy(&conn->c_lock);
1967 
1968 	if (conn->c_raddr.buf != NULL) {
1969 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
1970 	}
1971 	if (conn->c_laddr.buf != NULL) {
1972 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
1973 	}
1974 
1975 	/*
1976 	 * Credit control cleanup.
1977 	 */
1978 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
1979 		rdma_clnt_cred_ctrl_t *cc_info;
1980 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1981 		cv_destroy(&cc_info->clnt_cc_cv);
1982 	}
1983 
1984 	kmem_free(qp, sizeof (rib_qp_t));
1985 
1986 	/*
1987 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
1988 	 * then the hca is no longer being used.
1989 	 */
1990 	if (conn_list != NULL) {
1991 		rw_enter(&hca->state_lock, RW_READER);
1992 		if (hca->state == HCA_DETACHED) {
1993 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
1994 			if (hca->srv_conn_list.conn_hd == NULL) {
1995 				rw_enter(&hca->cl_conn_list.conn_lock,
1996 				    RW_READER);
1997 
1998 				if (hca->cl_conn_list.conn_hd == NULL) {
1999 					mutex_enter(&hca->inuse_lock);
2000 					hca->inuse = FALSE;
2001 					cv_signal(&hca->cb_cv);
2002 					mutex_exit(&hca->inuse_lock);
2003 				}
2004 				rw_exit(&hca->cl_conn_list.conn_lock);
2005 			}
2006 			rw_exit(&hca->srv_conn_list.conn_lock);
2007 		}
2008 		rw_exit(&hca->state_lock);
2009 	}
2010 
2011 	return (RDMA_SUCCESS);
2012 }
2013 
2014 /*
2015  * Wait for send completion notification. Only on receiving a
2016  * notification be it a successful or error completion, free the
2017  * send_wid.
2018  */
2019 static rdma_stat
2020 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2021 {
2022 	clock_t timout, cv_wait_ret;
2023 	rdma_stat error = RDMA_SUCCESS;
2024 	int	i;
2025 
2026 	/*
2027 	 * Wait for send to complete
2028 	 */
2029 	ASSERT(wd != NULL);
2030 	mutex_enter(&wd->sendwait_lock);
2031 	if (wd->status == (uint_t)SEND_WAIT) {
2032 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2033 		    ddi_get_lbolt();
2034 
2035 		if (qp->mode == RIB_SERVER) {
2036 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2037 			    &wd->sendwait_lock, timout)) > 0 &&
2038 			    wd->status == (uint_t)SEND_WAIT)
2039 				;
2040 			switch (cv_wait_ret) {
2041 			case -1:	/* timeout */
2042 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2043 
2044 				wd->cv_sig = 0;		/* no signal needed */
2045 				error = RDMA_TIMEDOUT;
2046 				break;
2047 			default:	/* got send completion */
2048 				break;
2049 			}
2050 		} else {
2051 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2052 			    &wd->sendwait_lock, timout)) > 0 &&
2053 			    wd->status == (uint_t)SEND_WAIT)
2054 				;
2055 			switch (cv_wait_ret) {
2056 			case -1:	/* timeout */
2057 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2058 
2059 				wd->cv_sig = 0;		/* no signal needed */
2060 				error = RDMA_TIMEDOUT;
2061 				break;
2062 			case 0:		/* interrupted */
2063 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2064 
2065 				wd->cv_sig = 0;		/* no signal needed */
2066 				error = RDMA_INTR;
2067 				break;
2068 			default:	/* got send completion */
2069 				break;
2070 			}
2071 		}
2072 	}
2073 
2074 	if (wd->status != (uint_t)SEND_WAIT) {
2075 		/* got send completion */
2076 		if (wd->status != RDMA_SUCCESS) {
2077 			error = wd->status;
2078 		if (wd->status != RDMA_CONNLOST)
2079 			error = RDMA_FAILED;
2080 		}
2081 		for (i = 0; i < wd->nsbufs; i++) {
2082 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2083 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2084 		}
2085 		mutex_exit(&wd->sendwait_lock);
2086 		(void) rib_free_sendwait(wd);
2087 	} else {
2088 		mutex_exit(&wd->sendwait_lock);
2089 	}
2090 	return (error);
2091 }
2092 
2093 static struct send_wid *
2094 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2095 {
2096 	struct send_wid	*wd;
2097 
2098 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2099 	wd->xid = xid;
2100 	wd->cv_sig = cv_sig;
2101 	wd->qp = qp;
2102 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2103 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2104 	wd->status = (uint_t)SEND_WAIT;
2105 
2106 	return (wd);
2107 }
2108 
2109 static int
2110 rib_free_sendwait(struct send_wid *wdesc)
2111 {
2112 	cv_destroy(&wdesc->wait_cv);
2113 	mutex_destroy(&wdesc->sendwait_lock);
2114 	kmem_free(wdesc, sizeof (*wdesc));
2115 
2116 	return (0);
2117 }
2118 
2119 static rdma_stat
2120 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2121 {
2122 	mutex_enter(&qp->replylist_lock);
2123 	if (rep != NULL) {
2124 		(void) rib_remreply(qp, rep);
2125 		mutex_exit(&qp->replylist_lock);
2126 		return (RDMA_SUCCESS);
2127 	}
2128 	mutex_exit(&qp->replylist_lock);
2129 	return (RDMA_FAILED);
2130 }
2131 
2132 /*
2133  * Send buffers are freed here only in case of error in posting
2134  * on QP. If the post succeeded, the send buffers are freed upon
2135  * send completion in rib_sendwait() or in the scq_handler.
2136  */
2137 rdma_stat
2138 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2139 	int send_sig, int cv_sig, caddr_t *swid)
2140 {
2141 	struct send_wid	*wdesc;
2142 	struct clist	*clp;
2143 	ibt_status_t	ibt_status = IBT_SUCCESS;
2144 	rdma_stat	ret = RDMA_SUCCESS;
2145 	ibt_send_wr_t	tx_wr;
2146 	int		i, nds;
2147 	ibt_wr_ds_t	sgl[DSEG_MAX];
2148 	uint_t		total_msg_size;
2149 	rib_qp_t	*qp;
2150 
2151 	qp = ctoqp(conn);
2152 
2153 	ASSERT(cl != NULL);
2154 
2155 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2156 
2157 	nds = 0;
2158 	total_msg_size = 0;
2159 	clp = cl;
2160 	while (clp != NULL) {
2161 		if (nds >= DSEG_MAX) {
2162 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2163 			return (RDMA_FAILED);
2164 		}
2165 		sgl[nds].ds_va = clp->w.c_saddr;
2166 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2167 		sgl[nds].ds_len = clp->c_len;
2168 		total_msg_size += clp->c_len;
2169 		clp = clp->c_next;
2170 		nds++;
2171 	}
2172 
2173 	if (send_sig) {
2174 		/* Set SEND_SIGNAL flag. */
2175 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2176 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2177 		*swid = (caddr_t)wdesc;
2178 	} else {
2179 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2180 		wdesc = rib_init_sendwait(msgid, 0, qp);
2181 		*swid = (caddr_t)wdesc;
2182 	}
2183 	wdesc->nsbufs = nds;
2184 	for (i = 0; i < nds; i++) {
2185 		wdesc->sbufaddr[i] = sgl[i].ds_va;
2186 	}
2187 
2188 	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2189 	tx_wr.wr_opcode = IBT_WRC_SEND;
2190 	tx_wr.wr_trans = IBT_RC_SRV;
2191 	tx_wr.wr_nds = nds;
2192 	tx_wr.wr_sgl = sgl;
2193 
2194 	mutex_enter(&conn->c_lock);
2195 	if (conn->c_state == C_CONNECTED) {
2196 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2197 	}
2198 	if (conn->c_state != C_CONNECTED ||
2199 	    ibt_status != IBT_SUCCESS) {
2200 		if (conn->c_state != C_DISCONN_PEND)
2201 			conn->c_state = C_ERROR_CONN;
2202 		mutex_exit(&conn->c_lock);
2203 		for (i = 0; i < nds; i++) {
2204 			rib_rbuf_free(conn, SEND_BUFFER,
2205 			    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2206 		}
2207 
2208 		(void) rib_free_sendwait(wdesc);
2209 
2210 		return (RDMA_CONNLOST);
2211 	}
2212 	mutex_exit(&conn->c_lock);
2213 
2214 	if (send_sig) {
2215 		if (cv_sig) {
2216 			/*
2217 			 * cv_wait for send to complete.
2218 			 * We can fail due to a timeout or signal or
2219 			 * unsuccessful send.
2220 			 */
2221 			ret = rib_sendwait(qp, wdesc);
2222 
2223 			return (ret);
2224 		}
2225 	}
2226 
2227 	return (RDMA_SUCCESS);
2228 }
2229 
2230 
2231 rdma_stat
2232 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2233 {
2234 	rdma_stat	ret;
2235 	caddr_t		wd;
2236 
2237 	/* send-wait & cv_signal */
2238 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2239 	return (ret);
2240 }
2241 
2242 /*
2243  * Server interface (svc_rdma_ksend).
2244  * Send RPC reply and wait for RDMA_DONE.
2245  */
2246 rdma_stat
2247 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2248 {
2249 	rdma_stat ret = RDMA_SUCCESS;
2250 	struct rdma_done_list *rd;
2251 	clock_t timout, cv_wait_ret;
2252 	caddr_t *wid = NULL;
2253 	rib_qp_t *qp = ctoqp(conn);
2254 
2255 	mutex_enter(&qp->rdlist_lock);
2256 	rd = rdma_done_add(qp, msgid);
2257 
2258 	/* No cv_signal (whether send-wait or no-send-wait) */
2259 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2260 
2261 	if (ret != RDMA_SUCCESS) {
2262 		rdma_done_rm(qp, rd);
2263 	} else {
2264 		/*
2265 		 * Wait for RDMA_DONE from remote end
2266 		 */
2267 		timout =
2268 		    drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2269 		cv_wait_ret = cv_timedwait(&rd->rdma_done_cv,
2270 		    &qp->rdlist_lock,
2271 		    timout);
2272 
2273 		rdma_done_rm(qp, rd);
2274 
2275 		if (cv_wait_ret < 0) {
2276 			ret = RDMA_TIMEDOUT;
2277 		}
2278 	}
2279 
2280 	mutex_exit(&qp->rdlist_lock);
2281 	return (ret);
2282 }
2283 
2284 static struct recv_wid *
2285 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2286 {
2287 	struct recv_wid	*rwid;
2288 
2289 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2290 	rwid->xid = msgid;
2291 	rwid->addr = sgl->ds_va;
2292 	rwid->qp = qp;
2293 
2294 	return (rwid);
2295 }
2296 
2297 static void
2298 rib_free_wid(struct recv_wid *rwid)
2299 {
2300 	kmem_free(rwid, sizeof (struct recv_wid));
2301 }
2302 
2303 rdma_stat
2304 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2305 {
2306 	rib_qp_t	*qp = ctoqp(conn);
2307 	struct clist	*clp = cl;
2308 	struct reply	*rep;
2309 	struct recv_wid	*rwid;
2310 	int		nds;
2311 	ibt_wr_ds_t	sgl[DSEG_MAX];
2312 	ibt_recv_wr_t	recv_wr;
2313 	rdma_stat	ret;
2314 	ibt_status_t	ibt_status;
2315 
2316 	/*
2317 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2318 	 */
2319 
2320 	nds = 0;
2321 	while (cl != NULL) {
2322 		if (nds >= DSEG_MAX) {
2323 			ret = RDMA_FAILED;
2324 			goto done;
2325 		}
2326 		sgl[nds].ds_va = cl->w.c_saddr;
2327 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2328 		sgl[nds].ds_len = cl->c_len;
2329 		cl = cl->c_next;
2330 		nds++;
2331 	}
2332 
2333 	if (nds != 1) {
2334 		ret = RDMA_FAILED;
2335 		goto done;
2336 	}
2337 
2338 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2339 	recv_wr.wr_nds = nds;
2340 	recv_wr.wr_sgl = sgl;
2341 
2342 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2343 	if (rwid) {
2344 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2345 	} else {
2346 		ret = RDMA_NORESOURCE;
2347 		goto done;
2348 	}
2349 	rep = rib_addreplylist(qp, msgid);
2350 	if (!rep) {
2351 		rib_free_wid(rwid);
2352 		ret = RDMA_NORESOURCE;
2353 		goto done;
2354 	}
2355 
2356 	mutex_enter(&conn->c_lock);
2357 
2358 	if (conn->c_state == C_CONNECTED) {
2359 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2360 	}
2361 
2362 	if (conn->c_state != C_CONNECTED ||
2363 	    ibt_status != IBT_SUCCESS) {
2364 		if (conn->c_state != C_DISCONN_PEND)
2365 			conn->c_state = C_ERROR_CONN;
2366 		mutex_exit(&conn->c_lock);
2367 		rib_free_wid(rwid);
2368 		(void) rib_rem_rep(qp, rep);
2369 		ret = RDMA_CONNLOST;
2370 		goto done;
2371 	}
2372 	mutex_exit(&conn->c_lock);
2373 	return (RDMA_SUCCESS);
2374 
2375 done:
2376 	while (clp != NULL) {
2377 		rib_rbuf_free(conn, RECV_BUFFER,
2378 		    (void *)(uintptr_t)clp->w.c_saddr3);
2379 		clp = clp->c_next;
2380 	}
2381 	return (ret);
2382 }
2383 
2384 rdma_stat
2385 rib_svc_post(CONN* conn, struct clist *cl)
2386 {
2387 	rib_qp_t	*qp = ctoqp(conn);
2388 	struct svc_recv	*s_recvp;
2389 	int		nds;
2390 	ibt_wr_ds_t	sgl[DSEG_MAX];
2391 	ibt_recv_wr_t	recv_wr;
2392 	ibt_status_t	ibt_status;
2393 
2394 	nds = 0;
2395 	while (cl != NULL) {
2396 		if (nds >= DSEG_MAX) {
2397 			return (RDMA_FAILED);
2398 		}
2399 		sgl[nds].ds_va = cl->w.c_saddr;
2400 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2401 		sgl[nds].ds_len = cl->c_len;
2402 		cl = cl->c_next;
2403 		nds++;
2404 	}
2405 
2406 	if (nds != 1) {
2407 		rib_rbuf_free(conn, RECV_BUFFER,
2408 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2409 
2410 		return (RDMA_FAILED);
2411 	}
2412 
2413 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2414 	recv_wr.wr_nds = nds;
2415 	recv_wr.wr_sgl = sgl;
2416 
2417 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2418 	/* Use s_recvp's addr as wr id */
2419 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2420 	mutex_enter(&conn->c_lock);
2421 	if (conn->c_state == C_CONNECTED) {
2422 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2423 	}
2424 	if (conn->c_state != C_CONNECTED ||
2425 	    ibt_status != IBT_SUCCESS) {
2426 		if (conn->c_state != C_DISCONN_PEND)
2427 			conn->c_state = C_ERROR_CONN;
2428 		mutex_exit(&conn->c_lock);
2429 		rib_rbuf_free(conn, RECV_BUFFER,
2430 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2431 		(void) rib_free_svc_recv(s_recvp);
2432 
2433 		return (RDMA_CONNLOST);
2434 	}
2435 	mutex_exit(&conn->c_lock);
2436 
2437 	return (RDMA_SUCCESS);
2438 }
2439 
2440 /* Client */
2441 rdma_stat
2442 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2443 {
2444 
2445 	return (rib_clnt_post(conn, cl, msgid));
2446 }
2447 
2448 /* Client */
2449 rdma_stat
2450 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2451 {
2452 	rib_qp_t	*qp = ctoqp(conn);
2453 	struct reply	*rep;
2454 
2455 	mutex_enter(&qp->replylist_lock);
2456 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2457 		if (rep->xid == msgid) {
2458 			if (rep->vaddr_cq) {
2459 				rib_rbuf_free(conn, RECV_BUFFER,
2460 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2461 			}
2462 			(void) rib_remreply(qp, rep);
2463 			break;
2464 		}
2465 	}
2466 	mutex_exit(&qp->replylist_lock);
2467 
2468 	return (RDMA_SUCCESS);
2469 }
2470 
2471 /* Server */
2472 rdma_stat
2473 rib_post_recv(CONN *conn, struct clist *cl)
2474 {
2475 	rib_qp_t	*qp = ctoqp(conn);
2476 
2477 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2478 		mutex_enter(&qp->posted_rbufs_lock);
2479 		qp->n_posted_rbufs++;
2480 		mutex_exit(&qp->posted_rbufs_lock);
2481 		return (RDMA_SUCCESS);
2482 	}
2483 	return (RDMA_FAILED);
2484 }
2485 
2486 /*
2487  * Client side only interface to "recv" the rpc reply buf
2488  * posted earlier by rib_post_resp(conn, cl, msgid).
2489  */
2490 rdma_stat
2491 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2492 {
2493 	struct reply *rep = NULL;
2494 	clock_t timout, cv_wait_ret;
2495 	rdma_stat ret = RDMA_SUCCESS;
2496 	rib_qp_t *qp = ctoqp(conn);
2497 
2498 	/*
2499 	 * Find the reply structure for this msgid
2500 	 */
2501 	mutex_enter(&qp->replylist_lock);
2502 
2503 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2504 		if (rep->xid == msgid)
2505 			break;
2506 	}
2507 
2508 	if (rep != NULL) {
2509 		/*
2510 		 * If message not yet received, wait.
2511 		 */
2512 		if (rep->status == (uint_t)REPLY_WAIT) {
2513 			timout = ddi_get_lbolt() +
2514 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2515 
2516 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2517 			    &qp->replylist_lock, timout)) > 0 &&
2518 			    rep->status == (uint_t)REPLY_WAIT)
2519 				;
2520 
2521 			switch (cv_wait_ret) {
2522 			case -1:	/* timeout */
2523 				ret = RDMA_TIMEDOUT;
2524 				break;
2525 			case 0:
2526 				ret = RDMA_INTR;
2527 				break;
2528 			default:
2529 				break;
2530 			}
2531 		}
2532 
2533 		if (rep->status == RDMA_SUCCESS) {
2534 			struct clist *cl = NULL;
2535 
2536 			/*
2537 			 * Got message successfully
2538 			 */
2539 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2540 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2541 			*clp = cl;
2542 		} else {
2543 			if (rep->status != (uint_t)REPLY_WAIT) {
2544 				/*
2545 				 * Got error in reply message. Free
2546 				 * recv buffer here.
2547 				 */
2548 				ret = rep->status;
2549 				rib_rbuf_free(conn, RECV_BUFFER,
2550 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2551 			}
2552 		}
2553 		(void) rib_remreply(qp, rep);
2554 	} else {
2555 		/*
2556 		 * No matching reply structure found for given msgid on the
2557 		 * reply wait list.
2558 		 */
2559 		ret = RDMA_INVAL;
2560 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2561 	}
2562 
2563 	/*
2564 	 * Done.
2565 	 */
2566 	mutex_exit(&qp->replylist_lock);
2567 	return (ret);
2568 }
2569 
2570 /*
2571  * RDMA write a buffer to the remote address.
2572  */
2573 rdma_stat
2574 rib_write(CONN *conn, struct clist *cl, int wait)
2575 {
2576 	ibt_send_wr_t	tx_wr;
2577 	int		cv_sig;
2578 	int		i;
2579 	ibt_wr_ds_t	sgl[DSEG_MAX];
2580 	struct send_wid	*wdesc;
2581 	ibt_status_t	ibt_status;
2582 	rdma_stat	ret = RDMA_SUCCESS;
2583 	rib_qp_t	*qp = ctoqp(conn);
2584 	uint64_t	n_writes = 0;
2585 	bool_t		force_wait = FALSE;
2586 
2587 	if (cl == NULL) {
2588 		return (RDMA_FAILED);
2589 	}
2590 
2591 
2592 	while ((cl != NULL)) {
2593 		if (cl->c_len > 0) {
2594 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2595 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2596 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2597 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2598 			sgl[0].ds_va = cl->w.c_saddr;
2599 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2600 			sgl[0].ds_len = cl->c_len;
2601 
2602 			if (wait) {
2603 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2604 				cv_sig = 1;
2605 			} else {
2606 				if (n_writes > max_unsignaled_rws) {
2607 					n_writes = 0;
2608 					force_wait = TRUE;
2609 					tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2610 					cv_sig = 1;
2611 				} else {
2612 					tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2613 					cv_sig = 0;
2614 				}
2615 			}
2616 
2617 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2618 			tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2619 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2620 			tx_wr.wr_trans = IBT_RC_SRV;
2621 			tx_wr.wr_nds = 1;
2622 			tx_wr.wr_sgl = sgl;
2623 
2624 			mutex_enter(&conn->c_lock);
2625 			if (conn->c_state == C_CONNECTED) {
2626 				ibt_status =
2627 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2628 			}
2629 			if (conn->c_state != C_CONNECTED ||
2630 			    ibt_status != IBT_SUCCESS) {
2631 				if (conn->c_state != C_DISCONN_PEND)
2632 					conn->c_state = C_ERROR_CONN;
2633 				mutex_exit(&conn->c_lock);
2634 				(void) rib_free_sendwait(wdesc);
2635 				return (RDMA_CONNLOST);
2636 			}
2637 			mutex_exit(&conn->c_lock);
2638 
2639 			/*
2640 			 * Wait for send to complete
2641 			 */
2642 			if (wait || force_wait) {
2643 				force_wait = FALSE;
2644 				ret = rib_sendwait(qp, wdesc);
2645 				if (ret != 0) {
2646 					return (ret);
2647 				}
2648 			} else {
2649 				mutex_enter(&wdesc->sendwait_lock);
2650 				for (i = 0; i < wdesc->nsbufs; i++) {
2651 					rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2652 					    (void *)(uintptr_t)
2653 					    wdesc->sbufaddr[i]);
2654 				}
2655 				mutex_exit(&wdesc->sendwait_lock);
2656 				(void) rib_free_sendwait(wdesc);
2657 			}
2658 			n_writes ++;
2659 		}
2660 		cl = cl->c_next;
2661 	}
2662 	return (RDMA_SUCCESS);
2663 }
2664 
2665 /*
2666  * RDMA Read a buffer from the remote address.
2667  */
2668 rdma_stat
2669 rib_read(CONN *conn, struct clist *cl, int wait)
2670 {
2671 	ibt_send_wr_t	rx_wr;
2672 	int		cv_sig;
2673 	int		i;
2674 	ibt_wr_ds_t	sgl;
2675 	struct send_wid	*wdesc;
2676 	ibt_status_t	ibt_status = IBT_SUCCESS;
2677 	rdma_stat	ret = RDMA_SUCCESS;
2678 	rib_qp_t	*qp = ctoqp(conn);
2679 
2680 	if (cl == NULL) {
2681 		return (RDMA_FAILED);
2682 	}
2683 
2684 	while (cl != NULL) {
2685 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2686 		/*
2687 		 * Remote address is at the head chunk item in list.
2688 		 */
2689 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2690 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2691 
2692 		sgl.ds_va = cl->u.c_daddr;
2693 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2694 		sgl.ds_len = cl->c_len;
2695 
2696 		if (wait) {
2697 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2698 			cv_sig = 1;
2699 		} else {
2700 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2701 			cv_sig = 0;
2702 		}
2703 
2704 		wdesc = rib_init_sendwait(0, cv_sig, qp);
2705 		rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2706 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2707 		rx_wr.wr_trans = IBT_RC_SRV;
2708 		rx_wr.wr_nds = 1;
2709 		rx_wr.wr_sgl = &sgl;
2710 
2711 		mutex_enter(&conn->c_lock);
2712 		if (conn->c_state == C_CONNECTED) {
2713 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2714 		}
2715 		if (conn->c_state != C_CONNECTED ||
2716 		    ibt_status != IBT_SUCCESS) {
2717 			if (conn->c_state != C_DISCONN_PEND)
2718 				conn->c_state = C_ERROR_CONN;
2719 			mutex_exit(&conn->c_lock);
2720 			(void) rib_free_sendwait(wdesc);
2721 			return (RDMA_CONNLOST);
2722 		}
2723 		mutex_exit(&conn->c_lock);
2724 
2725 		/*
2726 		 * Wait for send to complete if this is the
2727 		 * last item in the list.
2728 		 */
2729 		if (wait && cl->c_next == NULL) {
2730 			ret = rib_sendwait(qp, wdesc);
2731 			if (ret != 0) {
2732 				return (ret);
2733 			}
2734 		} else {
2735 			mutex_enter(&wdesc->sendwait_lock);
2736 			for (i = 0; i < wdesc->nsbufs; i++) {
2737 				rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2738 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2739 			}
2740 			mutex_exit(&wdesc->sendwait_lock);
2741 			(void) rib_free_sendwait(wdesc);
2742 		}
2743 		cl = cl->c_next;
2744 	}
2745 	return (RDMA_SUCCESS);
2746 }
2747 
2748 /*
2749  * rib_srv_cm_handler()
2750  *    Connection Manager callback to handle RC connection requests.
2751  */
2752 /* ARGSUSED */
2753 static ibt_cm_status_t
2754 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2755 	ibt_cm_return_args_t *ret_args, void *priv_data,
2756 	ibt_priv_data_len_t len)
2757 {
2758 	queue_t		*q;
2759 	rib_qp_t	*qp;
2760 	rpcib_state_t	*ribstat;
2761 	rib_hca_t	*hca;
2762 	rdma_stat	status = RDMA_SUCCESS;
2763 	int		i;
2764 	struct clist	cl;
2765 	rdma_buf_t	rdbuf = {0};
2766 	void		*buf = NULL;
2767 	CONN		*conn;
2768 	ibt_ip_cm_info_t	ipinfo;
2769 	struct sockaddr_in *s;
2770 	struct sockaddr_in6 *s6;
2771 	int sin_size = sizeof (struct sockaddr_in);
2772 	int in_size = sizeof (struct in_addr);
2773 	int sin6_size = sizeof (struct sockaddr_in6);
2774 
2775 	ASSERT(any != NULL);
2776 	ASSERT(event != NULL);
2777 
2778 	ribstat = (rpcib_state_t *)any;
2779 	hca = (rib_hca_t *)ribstat->hca;
2780 	ASSERT(hca != NULL);
2781 
2782 	/* got a connection request */
2783 	switch (event->cm_type) {
2784 	case IBT_CM_EVENT_REQ_RCV:
2785 		/*
2786 		 * If the plugin is in the NO_ACCEPT state, bail out.
2787 		 */
2788 		mutex_enter(&plugin_state_lock);
2789 		if (plugin_state == NO_ACCEPT) {
2790 			mutex_exit(&plugin_state_lock);
2791 			return (IBT_CM_REJECT);
2792 		}
2793 		mutex_exit(&plugin_state_lock);
2794 
2795 		/*
2796 		 * Need to send a MRA MAD to CM so that it does not
2797 		 * timeout on us.
2798 		 */
2799 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2800 		    event->cm_event.req.req_timeout * 8, NULL, 0);
2801 
2802 		mutex_enter(&rib_stat->open_hca_lock);
2803 		q = rib_stat->q;
2804 		mutex_exit(&rib_stat->open_hca_lock);
2805 
2806 		status = rib_svc_create_chan(hca, (caddr_t)q,
2807 		    event->cm_event.req.req_prim_hca_port, &qp);
2808 
2809 		if (status) {
2810 			return (IBT_CM_REJECT);
2811 		}
2812 
2813 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2814 		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2815 		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2816 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2817 
2818 		/*
2819 		 * Pre-posts RECV buffers
2820 		 */
2821 		conn = qptoc(qp);
2822 		for (i = 0; i < preposted_rbufs; i++) {
2823 			bzero(&rdbuf, sizeof (rdbuf));
2824 			rdbuf.type = RECV_BUFFER;
2825 			buf = rib_rbuf_alloc(conn, &rdbuf);
2826 			if (buf == NULL) {
2827 				(void) rib_disconnect_channel(conn, NULL);
2828 				return (IBT_CM_REJECT);
2829 			}
2830 
2831 			bzero(&cl, sizeof (cl));
2832 			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
2833 			cl.c_len = rdbuf.len;
2834 			cl.c_smemhandle.mrc_lmr =
2835 			    rdbuf.handle.mrc_lmr; /* lkey */
2836 			cl.c_next = NULL;
2837 			status = rib_post_recv(conn, &cl);
2838 			if (status != RDMA_SUCCESS) {
2839 				(void) rib_disconnect_channel(conn, NULL);
2840 				return (IBT_CM_REJECT);
2841 			}
2842 		}
2843 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
2844 
2845 		/*
2846 		 * Get the address translation
2847 		 */
2848 		rw_enter(&hca->state_lock, RW_READER);
2849 		if (hca->state == HCA_DETACHED) {
2850 			rw_exit(&hca->state_lock);
2851 			return (IBT_CM_REJECT);
2852 		}
2853 		rw_exit(&hca->state_lock);
2854 
2855 		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
2856 
2857 		if (ibt_get_ip_data(event->cm_priv_data_len,
2858 		    event->cm_priv_data,
2859 		    &ipinfo) != IBT_SUCCESS) {
2860 
2861 			return (IBT_CM_REJECT);
2862 		}
2863 
2864 		switch (ipinfo.src_addr.family) {
2865 		case AF_INET:
2866 
2867 			conn->c_raddr.maxlen =
2868 			    conn->c_raddr.len = sin_size;
2869 			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
2870 
2871 			s = (struct sockaddr_in *)conn->c_raddr.buf;
2872 			s->sin_family = AF_INET;
2873 
2874 			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
2875 			    &s->sin_addr, in_size);
2876 
2877 			break;
2878 
2879 		case AF_INET6:
2880 
2881 			conn->c_raddr.maxlen =
2882 			    conn->c_raddr.len = sin6_size;
2883 			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
2884 
2885 			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
2886 			s6->sin6_family = AF_INET6;
2887 			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
2888 			    &s6->sin6_addr,
2889 			    sizeof (struct in6_addr));
2890 
2891 			break;
2892 
2893 		default:
2894 			return (IBT_CM_REJECT);
2895 		}
2896 
2897 		break;
2898 
2899 	case IBT_CM_EVENT_CONN_CLOSED:
2900 	{
2901 		CONN		*conn;
2902 		rib_qp_t	*qp;
2903 
2904 		switch (event->cm_event.closed) {
2905 		case IBT_CM_CLOSED_DREP_RCVD:
2906 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
2907 		case IBT_CM_CLOSED_DUP:
2908 		case IBT_CM_CLOSED_ABORT:
2909 		case IBT_CM_CLOSED_ALREADY:
2910 			/*
2911 			 * These cases indicate the local end initiated
2912 			 * the closing of the channel. Nothing to do here.
2913 			 */
2914 			break;
2915 		default:
2916 			/*
2917 			 * Reason for CONN_CLOSED event must be one of
2918 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
2919 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
2920 			 * the remote end is closing the channel. In these
2921 			 * cases free the channel and transition to error
2922 			 * state
2923 			 */
2924 			qp = ibt_get_chan_private(event->cm_channel);
2925 			conn = qptoc(qp);
2926 			mutex_enter(&conn->c_lock);
2927 			if (conn->c_state == C_DISCONN_PEND) {
2928 				mutex_exit(&conn->c_lock);
2929 				break;
2930 			}
2931 			conn->c_state = C_ERROR_CONN;
2932 
2933 			/*
2934 			 * Free the rc_channel. Channel has already
2935 			 * transitioned to ERROR state and WRs have been
2936 			 * FLUSHED_ERR already.
2937 			 */
2938 			(void) ibt_free_channel(qp->qp_hdl);
2939 			qp->qp_hdl = NULL;
2940 
2941 			/*
2942 			 * Free the conn if c_ref goes down to 0
2943 			 */
2944 			if (conn->c_ref == 0) {
2945 				/*
2946 				 * Remove from list and free conn
2947 				 */
2948 				conn->c_state = C_DISCONN_PEND;
2949 				mutex_exit(&conn->c_lock);
2950 				(void) rib_disconnect_channel(conn,
2951 				    &hca->srv_conn_list);
2952 			} else {
2953 				mutex_exit(&conn->c_lock);
2954 			}
2955 			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
2956 			break;
2957 		}
2958 		break;
2959 	}
2960 	case IBT_CM_EVENT_CONN_EST:
2961 		/*
2962 		 * RTU received, hence connection established.
2963 		 */
2964 		if (rib_debug > 1)
2965 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2966 			    "(CONN_EST) channel established");
2967 		break;
2968 
2969 	default:
2970 		if (rib_debug > 2) {
2971 			/* Let CM handle the following events. */
2972 			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
2973 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2974 				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
2975 			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
2976 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2977 				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
2978 			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
2979 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2980 				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
2981 			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
2982 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2983 				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
2984 			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
2985 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2986 				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
2987 			}
2988 		}
2989 		return (IBT_CM_DEFAULT);
2990 	}
2991 
2992 	/* accept all other CM messages (i.e. let the CM handle them) */
2993 	return (IBT_CM_ACCEPT);
2994 }
2995 
2996 static rdma_stat
2997 rib_register_service(rib_hca_t *hca, int service_type)
2998 {
2999 	ibt_srv_desc_t		sdesc;
3000 	ibt_hca_portinfo_t	*port_infop;
3001 	ib_svc_id_t		srv_id;
3002 	ibt_srv_hdl_t		srv_hdl;
3003 	uint_t			port_size;
3004 	uint_t			pki, i, num_ports, nbinds;
3005 	ibt_status_t		ibt_status;
3006 	rib_service_t		*new_service;
3007 	ib_pkey_t		pkey;
3008 
3009 	/*
3010 	 * Query all ports for the given HCA
3011 	 */
3012 	rw_enter(&hca->state_lock, RW_READER);
3013 	if (hca->state != HCA_DETACHED) {
3014 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3015 		    &num_ports, &port_size);
3016 		rw_exit(&hca->state_lock);
3017 	} else {
3018 		rw_exit(&hca->state_lock);
3019 		return (RDMA_FAILED);
3020 	}
3021 	if (ibt_status != IBT_SUCCESS) {
3022 		return (RDMA_FAILED);
3023 	}
3024 
3025 	DTRACE_PROBE1(rpcib__i__regservice_numports,
3026 	    int, num_ports);
3027 
3028 	for (i = 0; i < num_ports; i++) {
3029 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3030 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3031 			    int, i+1);
3032 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3033 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
3034 			    int, i+1);
3035 		}
3036 	}
3037 
3038 	/*
3039 	 * Get all the IP addresses on this system to register the
3040 	 * given "service type" on all DNS recognized IP addrs.
3041 	 * Each service type such as NFS will have all the systems
3042 	 * IP addresses as its different names. For now the only
3043 	 * type of service we support in RPCIB is NFS.
3044 	 */
3045 	rw_enter(&hca->service_list_lock, RW_WRITER);
3046 	/*
3047 	 * Start registering and binding service to active
3048 	 * on active ports on this HCA.
3049 	 */
3050 	nbinds = 0;
3051 	new_service = NULL;
3052 
3053 	/*
3054 	 * We use IP addresses as the service names for
3055 	 * service registration.  Register each of them
3056 	 * with CM to obtain a svc_id and svc_hdl.  We do not
3057 	 * register the service with machine's loopback address.
3058 	 */
3059 	(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3060 	(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3061 	(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3062 
3063 	sdesc.sd_handler = rib_srv_cm_handler;
3064 	sdesc.sd_flags = 0;
3065 	ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3066 	    &sdesc, ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT),
3067 	    1, &srv_hdl, &srv_id);
3068 
3069 	for (i = 0; i < num_ports; i++) {
3070 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3071 			continue;
3072 
3073 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3074 			pkey = port_infop[i].p_pkey_tbl[pki];
3075 			if ((pkey & IBSRM_HB) &&
3076 			    (pkey != IB_PKEY_INVALID_FULL)) {
3077 
3078 				/*
3079 				 * Allocate and prepare a service entry
3080 				 */
3081 				new_service =
3082 				    kmem_zalloc(1 * sizeof (rib_service_t),
3083 				    KM_SLEEP);
3084 
3085 				new_service->srv_type = service_type;
3086 				new_service->srv_hdl = srv_hdl;
3087 				new_service->srv_next = NULL;
3088 
3089 				ibt_status = ibt_bind_service(srv_hdl,
3090 				    port_infop[i].p_sgid_tbl[0],
3091 				    NULL, rib_stat, NULL);
3092 
3093 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3094 				    int, ibt_status);
3095 
3096 				if (ibt_status != IBT_SUCCESS) {
3097 					kmem_free(new_service,
3098 					    sizeof (rib_service_t));
3099 					new_service = NULL;
3100 					continue;
3101 				}
3102 
3103 				/*
3104 				 * Add to the service list for this HCA
3105 				 */
3106 				new_service->srv_next = hca->service_list;
3107 				hca->service_list = new_service;
3108 				new_service = NULL;
3109 				nbinds++;
3110 			}
3111 		}
3112 	}
3113 	rw_exit(&hca->service_list_lock);
3114 
3115 	ibt_free_portinfo(port_infop, port_size);
3116 
3117 	if (nbinds == 0) {
3118 		return (RDMA_FAILED);
3119 	} else {
3120 		/*
3121 		 * Put this plugin into accept state, since atleast
3122 		 * one registration was successful.
3123 		 */
3124 		mutex_enter(&plugin_state_lock);
3125 		plugin_state = ACCEPT;
3126 		mutex_exit(&plugin_state_lock);
3127 		return (RDMA_SUCCESS);
3128 	}
3129 }
3130 
3131 void
3132 rib_listen(struct rdma_svc_data *rd)
3133 {
3134 	rdma_stat status = RDMA_SUCCESS;
3135 
3136 	rd->active = 0;
3137 	rd->err_code = RDMA_FAILED;
3138 
3139 	/*
3140 	 * First check if a hca is still attached
3141 	 */
3142 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3143 	if (rib_stat->hca->state != HCA_INITED) {
3144 		rw_exit(&rib_stat->hca->state_lock);
3145 		return;
3146 	}
3147 	rw_exit(&rib_stat->hca->state_lock);
3148 
3149 	rib_stat->q = &rd->q;
3150 	/*
3151 	 * Right now the only service type is NFS. Hence force feed this
3152 	 * value. Ideally to communicate the service type it should be
3153 	 * passed down in rdma_svc_data.
3154 	 */
3155 	rib_stat->service_type = NFS;
3156 	status = rib_register_service(rib_stat->hca, NFS);
3157 	if (status != RDMA_SUCCESS) {
3158 		rd->err_code = status;
3159 		return;
3160 	}
3161 	/*
3162 	 * Service active on an HCA, check rd->err_code for more
3163 	 * explainable errors.
3164 	 */
3165 	rd->active = 1;
3166 	rd->err_code = status;
3167 }
3168 
3169 /* XXXX */
3170 /* ARGSUSED */
3171 static void
3172 rib_listen_stop(struct rdma_svc_data *svcdata)
3173 {
3174 	rib_hca_t		*hca;
3175 
3176 	/*
3177 	 * KRPC called the RDMATF to stop the listeners, this means
3178 	 * stop sending incomming or recieved requests to KRPC master
3179 	 * transport handle for RDMA-IB. This is also means that the
3180 	 * master transport handle, responsible for us, is going away.
3181 	 */
3182 	mutex_enter(&plugin_state_lock);
3183 	plugin_state = NO_ACCEPT;
3184 	if (svcdata != NULL)
3185 		svcdata->active = 0;
3186 	mutex_exit(&plugin_state_lock);
3187 
3188 	/*
3189 	 * First check if a hca is still attached
3190 	 */
3191 	hca = rib_stat->hca;
3192 	rw_enter(&hca->state_lock, RW_READER);
3193 	if (hca->state != HCA_INITED) {
3194 		rw_exit(&hca->state_lock);
3195 		return;
3196 	}
3197 	rib_close_channels(&hca->srv_conn_list);
3198 	rib_stop_services(hca);
3199 	rw_exit(&hca->state_lock);
3200 }
3201 
3202 /*
3203  * Traverse the HCA's service list to unbind and deregister services.
3204  * Instead of unbinding the service for a service handle by
3205  * calling ibt_unbind_service() for each port/pkey, we unbind
3206  * all the services for the service handle by making only one
3207  * call to ibt_unbind_all_services().  Then, we deregister the
3208  * service for the service handle.
3209  *
3210  * When traversing the entries in service_list, we compare the
3211  * srv_hdl of the current entry with that of the next.  If they
3212  * are different or if the next entry is NULL, the current entry
3213  * marks the last binding of the service handle.  In this case,
3214  * call ibt_unbind_all_services() and deregister the service for
3215  * the service handle.  If they are the same, the current and the
3216  * next entries are bound to the same service handle.  In this
3217  * case, move on to the next entry.
3218  */
3219 static void
3220 rib_stop_services(rib_hca_t *hca)
3221 {
3222 	rib_service_t		*srv_list, *to_remove;
3223 
3224 	/*
3225 	 * unbind and deregister the services for this service type.
3226 	 * Right now there is only one service type. In future it will
3227 	 * be passed down to this function.
3228 	 */
3229 	rw_enter(&hca->service_list_lock, RW_WRITER);
3230 	srv_list = hca->service_list;
3231 	while (srv_list != NULL) {
3232 		to_remove = srv_list;
3233 		srv_list = to_remove->srv_next;
3234 		if (srv_list == NULL || bcmp(to_remove->srv_hdl,
3235 		    srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
3236 
3237 			(void) ibt_unbind_all_services(to_remove->srv_hdl);
3238 			(void) ibt_deregister_service(hca->ibt_clnt_hdl,
3239 			    to_remove->srv_hdl);
3240 		}
3241 
3242 		kmem_free(to_remove, sizeof (rib_service_t));
3243 	}
3244 	hca->service_list = NULL;
3245 	rw_exit(&hca->service_list_lock);
3246 }
3247 
3248 static struct svc_recv *
3249 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3250 {
3251 	struct svc_recv	*recvp;
3252 
3253 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3254 	recvp->vaddr = sgl->ds_va;
3255 	recvp->qp = qp;
3256 	recvp->bytes_xfer = 0;
3257 	return (recvp);
3258 }
3259 
3260 static int
3261 rib_free_svc_recv(struct svc_recv *recvp)
3262 {
3263 	kmem_free(recvp, sizeof (*recvp));
3264 
3265 	return (0);
3266 }
3267 
3268 static struct reply *
3269 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3270 {
3271 	struct reply	*rep;
3272 
3273 
3274 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3275 	if (rep == NULL) {
3276 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3277 		return (NULL);
3278 	}
3279 	rep->xid = msgid;
3280 	rep->vaddr_cq = NULL;
3281 	rep->bytes_xfer = 0;
3282 	rep->status = (uint_t)REPLY_WAIT;
3283 	rep->prev = NULL;
3284 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3285 
3286 	mutex_enter(&qp->replylist_lock);
3287 	if (qp->replylist) {
3288 		rep->next = qp->replylist;
3289 		qp->replylist->prev = rep;
3290 	}
3291 	qp->rep_list_size++;
3292 
3293 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3294 	    int, qp->rep_list_size);
3295 
3296 	qp->replylist = rep;
3297 	mutex_exit(&qp->replylist_lock);
3298 
3299 	return (rep);
3300 }
3301 
3302 static rdma_stat
3303 rib_rem_replylist(rib_qp_t *qp)
3304 {
3305 	struct reply	*r, *n;
3306 
3307 	mutex_enter(&qp->replylist_lock);
3308 	for (r = qp->replylist; r != NULL; r = n) {
3309 		n = r->next;
3310 		(void) rib_remreply(qp, r);
3311 	}
3312 	mutex_exit(&qp->replylist_lock);
3313 
3314 	return (RDMA_SUCCESS);
3315 }
3316 
3317 static int
3318 rib_remreply(rib_qp_t *qp, struct reply *rep)
3319 {
3320 
3321 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3322 	if (rep->prev) {
3323 		rep->prev->next = rep->next;
3324 	}
3325 	if (rep->next) {
3326 		rep->next->prev = rep->prev;
3327 	}
3328 	if (qp->replylist == rep)
3329 		qp->replylist = rep->next;
3330 
3331 	cv_destroy(&rep->wait_cv);
3332 	qp->rep_list_size--;
3333 
3334 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3335 	    int, qp->rep_list_size);
3336 
3337 	kmem_free(rep, sizeof (*rep));
3338 
3339 	return (0);
3340 }
3341 
3342 rdma_stat
3343 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3344 	struct mrc *buf_handle)
3345 {
3346 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3347 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3348 	rdma_stat	status;
3349 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3350 
3351 	/*
3352 	 * Note: ALL buffer pools use the same memory type RDMARW.
3353 	 */
3354 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3355 	if (status == RDMA_SUCCESS) {
3356 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3357 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3358 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3359 	} else {
3360 		buf_handle->mrc_linfo = NULL;
3361 		buf_handle->mrc_lmr = 0;
3362 		buf_handle->mrc_rmr = 0;
3363 	}
3364 	return (status);
3365 }
3366 
3367 static rdma_stat
3368 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3369 	ibt_mr_flags_t spec,
3370 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3371 {
3372 	ibt_mr_attr_t	mem_attr;
3373 	ibt_status_t	ibt_status;
3374 	mem_attr.mr_vaddr = (uintptr_t)buf;
3375 	mem_attr.mr_len = (ib_msglen_t)size;
3376 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3377 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3378 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3379 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3380 
3381 	rw_enter(&hca->state_lock, RW_READER);
3382 	if (hca->state == HCA_INITED) {
3383 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3384 		    &mem_attr, mr_hdlp, mr_descp);
3385 		rw_exit(&hca->state_lock);
3386 	} else {
3387 		rw_exit(&hca->state_lock);
3388 		return (RDMA_FAILED);
3389 	}
3390 
3391 	if (ibt_status != IBT_SUCCESS) {
3392 		return (RDMA_FAILED);
3393 	}
3394 	return (RDMA_SUCCESS);
3395 }
3396 
3397 rdma_stat
3398 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3399 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3400 {
3401 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3402 	rib_lrc_entry_t *l;
3403 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3404 	rdma_stat	status;
3405 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3406 
3407 	/*
3408 	 * Non-coherent memory registration.
3409 	 */
3410 	l = (rib_lrc_entry_t *)lrc;
3411 	if (l) {
3412 		if (l->registered) {
3413 			buf_handle->mrc_linfo =
3414 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3415 			buf_handle->mrc_lmr =
3416 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3417 			buf_handle->mrc_rmr =
3418 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3419 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3420 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3421 			return (RDMA_SUCCESS);
3422 		} else {
3423 			/* Always register the whole buffer */
3424 			buf = (caddr_t)l->lrc_buf;
3425 			buflen = l->lrc_len;
3426 		}
3427 	}
3428 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3429 
3430 	if (status == RDMA_SUCCESS) {
3431 		if (l) {
3432 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3433 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3434 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3435 			l->registered		 = TRUE;
3436 		}
3437 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3438 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3439 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3440 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3441 	} else {
3442 		buf_handle->mrc_linfo = NULL;
3443 		buf_handle->mrc_lmr = 0;
3444 		buf_handle->mrc_rmr = 0;
3445 	}
3446 	return (status);
3447 }
3448 
3449 /* ARGSUSED */
3450 rdma_stat
3451 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3452 {
3453 	rib_hca_t *hca = (ctoqp(conn))->hca;
3454 	/*
3455 	 * Allow memory deregistration even if HCA is
3456 	 * getting detached. Need all outstanding
3457 	 * memory registrations to be deregistered
3458 	 * before HCA_DETACH_EVENT can be accepted.
3459 	 */
3460 	(void) ibt_deregister_mr(hca->hca_hdl,
3461 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3462 	return (RDMA_SUCCESS);
3463 }
3464 
3465 /* ARGSUSED */
3466 rdma_stat
3467 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3468 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3469 {
3470 	rib_lrc_entry_t *l;
3471 	l = (rib_lrc_entry_t *)lrc;
3472 	if (l)
3473 		if (l->registered)
3474 			return (RDMA_SUCCESS);
3475 
3476 	(void) rib_deregistermem(conn, buf, buf_handle);
3477 
3478 	return (RDMA_SUCCESS);
3479 }
3480 
3481 /* ARGSUSED */
3482 rdma_stat
3483 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3484 		int len, int cpu)
3485 {
3486 	ibt_status_t	status;
3487 	rib_hca_t *hca = (ctoqp(conn))->hca;
3488 	ibt_mr_sync_t	mr_segment;
3489 
3490 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3491 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3492 	mr_segment.ms_len = (ib_memlen_t)len;
3493 	if (cpu) {
3494 		/* make incoming data visible to memory */
3495 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3496 	} else {
3497 		/* make memory changes visible to IO */
3498 		mr_segment.ms_flags = IBT_SYNC_READ;
3499 	}
3500 	rw_enter(&hca->state_lock, RW_READER);
3501 	if (hca->state == HCA_INITED) {
3502 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3503 		rw_exit(&hca->state_lock);
3504 	} else {
3505 		rw_exit(&hca->state_lock);
3506 		return (RDMA_FAILED);
3507 	}
3508 
3509 	if (status == IBT_SUCCESS)
3510 		return (RDMA_SUCCESS);
3511 	else {
3512 		return (RDMA_FAILED);
3513 	}
3514 }
3515 
3516 /*
3517  * XXXX	????
3518  */
3519 static rdma_stat
3520 rib_getinfo(rdma_info_t *info)
3521 {
3522 	/*
3523 	 * XXXX	Hack!
3524 	 */
3525 	info->addrlen = 16;
3526 	info->mts = 1000000;
3527 	info->mtu = 1000000;
3528 
3529 	return (RDMA_SUCCESS);
3530 }
3531 
3532 rib_bufpool_t *
3533 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3534 {
3535 	rib_bufpool_t	*rbp = NULL;
3536 	bufpool_t	*bp = NULL;
3537 	caddr_t		buf;
3538 	ibt_mr_attr_t	mem_attr;
3539 	ibt_status_t	ibt_status;
3540 	int		i, j;
3541 
3542 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3543 
3544 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3545 	    num * sizeof (void *), KM_SLEEP);
3546 
3547 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3548 	bp->numelems = num;
3549 
3550 
3551 	switch (ptype) {
3552 	case SEND_BUFFER:
3553 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3554 		bp->rsize = RPC_MSG_SZ;
3555 		break;
3556 	case RECV_BUFFER:
3557 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3558 		bp->rsize = RPC_BUF_SIZE;
3559 		break;
3560 	default:
3561 		goto fail;
3562 	}
3563 
3564 	/*
3565 	 * Register the pool.
3566 	 */
3567 	bp->bufsize = num * bp->rsize;
3568 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3569 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3570 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3571 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3572 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3573 	rw_enter(&hca->state_lock, RW_READER);
3574 
3575 	if (hca->state != HCA_INITED) {
3576 		rw_exit(&hca->state_lock);
3577 		goto fail;
3578 	}
3579 
3580 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3581 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3582 		mem_attr.mr_vaddr = (uintptr_t)buf;
3583 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3584 		mem_attr.mr_as = NULL;
3585 		ibt_status = ibt_register_mr(hca->hca_hdl,
3586 		    hca->pd_hdl, &mem_attr,
3587 		    &rbp->mr_hdl[i],
3588 		    &rbp->mr_desc[i]);
3589 		if (ibt_status != IBT_SUCCESS) {
3590 			for (j = 0; j < i; j++) {
3591 				(void) ibt_deregister_mr(hca->hca_hdl,
3592 				    rbp->mr_hdl[j]);
3593 			}
3594 			rw_exit(&hca->state_lock);
3595 			goto fail;
3596 		}
3597 	}
3598 	rw_exit(&hca->state_lock);
3599 	buf = (caddr_t)bp->buf;
3600 	for (i = 0; i < num; i++, buf += bp->rsize) {
3601 		bp->buflist[i] = (void *)buf;
3602 	}
3603 	bp->buffree = num - 1;	/* no. of free buffers */
3604 	rbp->bpool = bp;
3605 
3606 	return (rbp);
3607 fail:
3608 	if (bp) {
3609 		if (bp->buf)
3610 			kmem_free(bp->buf, bp->bufsize);
3611 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3612 	}
3613 	if (rbp) {
3614 		if (rbp->mr_hdl)
3615 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3616 		if (rbp->mr_desc)
3617 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3618 		kmem_free(rbp, sizeof (rib_bufpool_t));
3619 	}
3620 	return (NULL);
3621 }
3622 
3623 static void
3624 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3625 {
3626 	int i;
3627 	rib_bufpool_t *rbp = NULL;
3628 	bufpool_t *bp;
3629 
3630 	/*
3631 	 * Obtain pool address based on type of pool
3632 	 */
3633 	switch (ptype) {
3634 		case SEND_BUFFER:
3635 			rbp = hca->send_pool;
3636 			break;
3637 		case RECV_BUFFER:
3638 			rbp = hca->recv_pool;
3639 			break;
3640 		default:
3641 			return;
3642 	}
3643 	if (rbp == NULL)
3644 		return;
3645 
3646 	bp = rbp->bpool;
3647 
3648 	/*
3649 	 * Deregister the pool memory and free it.
3650 	 */
3651 	for (i = 0; i < bp->numelems; i++) {
3652 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3653 	}
3654 }
3655 
3656 static void
3657 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3658 {
3659 
3660 	rib_bufpool_t *rbp = NULL;
3661 	bufpool_t *bp;
3662 
3663 	/*
3664 	 * Obtain pool address based on type of pool
3665 	 */
3666 	switch (ptype) {
3667 		case SEND_BUFFER:
3668 			rbp = hca->send_pool;
3669 			break;
3670 		case RECV_BUFFER:
3671 			rbp = hca->recv_pool;
3672 			break;
3673 		default:
3674 			return;
3675 	}
3676 	if (rbp == NULL)
3677 		return;
3678 
3679 	bp = rbp->bpool;
3680 
3681 	/*
3682 	 * Free the pool memory.
3683 	 */
3684 	if (rbp->mr_hdl)
3685 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3686 
3687 	if (rbp->mr_desc)
3688 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3689 	if (bp->buf)
3690 		kmem_free(bp->buf, bp->bufsize);
3691 	mutex_destroy(&bp->buflock);
3692 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3693 	kmem_free(rbp, sizeof (rib_bufpool_t));
3694 }
3695 
3696 void
3697 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3698 {
3699 	/*
3700 	 * Deregister the pool memory and free it.
3701 	 */
3702 	rib_rbufpool_deregister(hca, ptype);
3703 	rib_rbufpool_free(hca, ptype);
3704 }
3705 
3706 /*
3707  * Fetch a buffer from the pool of type specified in rdbuf->type.
3708  */
3709 static rdma_stat
3710 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3711 {
3712 	rib_lrc_entry_t *rlep;
3713 
3714 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
3715 		rlep = rib_get_cache_buf(conn, rdbuf->len);
3716 		rdbuf->rb_private =  (caddr_t)rlep;
3717 		rdbuf->addr = rlep->lrc_buf;
3718 		rdbuf->handle = rlep->lrc_mhandle;
3719 		return (RDMA_SUCCESS);
3720 	}
3721 
3722 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
3723 	if (rdbuf->addr) {
3724 		switch (rdbuf->type) {
3725 		case SEND_BUFFER:
3726 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
3727 			break;
3728 		case RECV_BUFFER:
3729 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
3730 			break;
3731 		default:
3732 			rdbuf->len = 0;
3733 		}
3734 		return (RDMA_SUCCESS);
3735 	} else
3736 		return (RDMA_FAILED);
3737 }
3738 
3739 #if defined(MEASURE_POOL_DEPTH)
3740 static void rib_recv_bufs(uint32_t x) {
3741 
3742 }
3743 
3744 static void rib_send_bufs(uint32_t x) {
3745 
3746 }
3747 #endif
3748 
3749 /*
3750  * Fetch a buffer of specified type.
3751  * Note that rdbuf->handle is mw's rkey.
3752  */
3753 static void *
3754 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3755 {
3756 	rib_qp_t	*qp = ctoqp(conn);
3757 	rib_hca_t	*hca = qp->hca;
3758 	rdma_btype	ptype = rdbuf->type;
3759 	void		*buf;
3760 	rib_bufpool_t	*rbp = NULL;
3761 	bufpool_t	*bp;
3762 	int		i;
3763 
3764 	/*
3765 	 * Obtain pool address based on type of pool
3766 	 */
3767 	switch (ptype) {
3768 	case SEND_BUFFER:
3769 		rbp = hca->send_pool;
3770 		break;
3771 	case RECV_BUFFER:
3772 		rbp = hca->recv_pool;
3773 		break;
3774 	default:
3775 		return (NULL);
3776 	}
3777 	if (rbp == NULL)
3778 		return (NULL);
3779 
3780 	bp = rbp->bpool;
3781 
3782 	mutex_enter(&bp->buflock);
3783 	if (bp->buffree < 0) {
3784 		mutex_exit(&bp->buflock);
3785 		return (NULL);
3786 	}
3787 
3788 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
3789 	buf = bp->buflist[bp->buffree];
3790 	rdbuf->addr = buf;
3791 	rdbuf->len = bp->rsize;
3792 	for (i = bp->numelems - 1; i >= 0; i--) {
3793 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
3794 			rdbuf->handle.mrc_rmr =
3795 			    (uint32_t)rbp->mr_desc[i].md_rkey;
3796 			rdbuf->handle.mrc_linfo =
3797 			    (uintptr_t)rbp->mr_hdl[i];
3798 			rdbuf->handle.mrc_lmr =
3799 			    (uint32_t)rbp->mr_desc[i].md_lkey;
3800 #if defined(MEASURE_POOL_DEPTH)
3801 			if (ptype == SEND_BUFFER)
3802 				rib_send_bufs(MAX_BUFS - (bp->buffree+1));
3803 			if (ptype == RECV_BUFFER)
3804 				rib_recv_bufs(MAX_BUFS - (bp->buffree+1));
3805 #endif
3806 			bp->buffree--;
3807 
3808 			mutex_exit(&bp->buflock);
3809 
3810 			return (buf);
3811 		}
3812 	}
3813 
3814 	mutex_exit(&bp->buflock);
3815 
3816 	return (NULL);
3817 }
3818 
3819 static void
3820 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
3821 {
3822 
3823 	if (rdbuf->type == RDMA_LONG_BUFFER) {
3824 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
3825 		rdbuf->rb_private = NULL;
3826 		return;
3827 	}
3828 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
3829 }
3830 
3831 static void
3832 rib_rbuf_free(CONN *conn, int ptype, void *buf)
3833 {
3834 	rib_qp_t *qp = ctoqp(conn);
3835 	rib_hca_t *hca = qp->hca;
3836 	rib_bufpool_t *rbp = NULL;
3837 	bufpool_t *bp;
3838 
3839 	/*
3840 	 * Obtain pool address based on type of pool
3841 	 */
3842 	switch (ptype) {
3843 	case SEND_BUFFER:
3844 		rbp = hca->send_pool;
3845 		break;
3846 	case RECV_BUFFER:
3847 		rbp = hca->recv_pool;
3848 		break;
3849 	default:
3850 		return;
3851 	}
3852 	if (rbp == NULL)
3853 		return;
3854 
3855 	bp = rbp->bpool;
3856 
3857 	mutex_enter(&bp->buflock);
3858 	if (++bp->buffree >= bp->numelems) {
3859 		/*
3860 		 * Should never happen
3861 		 */
3862 		bp->buffree--;
3863 	} else {
3864 		bp->buflist[bp->buffree] = buf;
3865 	}
3866 	mutex_exit(&bp->buflock);
3867 }
3868 
3869 static rdma_stat
3870 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
3871 {
3872 	rw_enter(&connlist->conn_lock, RW_WRITER);
3873 	if (connlist->conn_hd) {
3874 		cn->c_next = connlist->conn_hd;
3875 		connlist->conn_hd->c_prev = cn;
3876 	}
3877 	connlist->conn_hd = cn;
3878 	rw_exit(&connlist->conn_lock);
3879 
3880 	return (RDMA_SUCCESS);
3881 }
3882 
3883 static rdma_stat
3884 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
3885 {
3886 	rw_enter(&connlist->conn_lock, RW_WRITER);
3887 	if (cn->c_prev) {
3888 		cn->c_prev->c_next = cn->c_next;
3889 	}
3890 	if (cn->c_next) {
3891 		cn->c_next->c_prev = cn->c_prev;
3892 	}
3893 	if (connlist->conn_hd == cn)
3894 		connlist->conn_hd = cn->c_next;
3895 	rw_exit(&connlist->conn_lock);
3896 
3897 	return (RDMA_SUCCESS);
3898 }
3899 
3900 /*
3901  * Connection management.
3902  * IBTF does not support recycling of channels. So connections are only
3903  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
3904  * C_DISCONN_PEND state. No C_IDLE state.
3905  * C_CONN_PEND state: Connection establishment in progress to the server.
3906  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
3907  * It has an RC channel associated with it. ibt_post_send/recv are allowed
3908  * only in this state.
3909  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
3910  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
3911  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
3912  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
3913  * c_ref drops to 0 (this indicates that RPC has no more references to this
3914  * connection), the connection should be destroyed. A connection transitions
3915  * into this state when it is being destroyed.
3916  */
3917 /* ARGSUSED */
3918 static rdma_stat
3919 rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
3920 {
3921 	CONN *cn;
3922 	int status = RDMA_SUCCESS;
3923 	rib_hca_t *hca = rib_stat->hca;
3924 	rib_qp_t *qp;
3925 	clock_t cv_stat, timout;
3926 	ibt_path_info_t path;
3927 	ibt_ip_addr_t s_ip, d_ip;
3928 
3929 	if (hca == NULL)
3930 		return (RDMA_FAILED);
3931 
3932 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3933 	if (hca->state == HCA_DETACHED) {
3934 		rw_exit(&rib_stat->hca->state_lock);
3935 		return (RDMA_FAILED);
3936 	}
3937 	rw_exit(&rib_stat->hca->state_lock);
3938 
3939 again:
3940 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
3941 	cn = hca->cl_conn_list.conn_hd;
3942 	while (cn != NULL) {
3943 		/*
3944 		 * First, clear up any connection in the ERROR state
3945 		 */
3946 		mutex_enter(&cn->c_lock);
3947 		if (cn->c_state == C_ERROR_CONN) {
3948 			if (cn->c_ref == 0) {
3949 				/*
3950 				 * Remove connection from list and destroy it.
3951 				 */
3952 				cn->c_state = C_DISCONN_PEND;
3953 				mutex_exit(&cn->c_lock);
3954 				rw_exit(&hca->cl_conn_list.conn_lock);
3955 				(void) rib_disconnect_channel(cn,
3956 				    &hca->cl_conn_list);
3957 				goto again;
3958 			}
3959 			mutex_exit(&cn->c_lock);
3960 			cn = cn->c_next;
3961 			continue;
3962 		}
3963 		if (cn->c_state == C_DISCONN_PEND) {
3964 			mutex_exit(&cn->c_lock);
3965 			cn = cn->c_next;
3966 			continue;
3967 		}
3968 		if ((cn->c_raddr.len == svcaddr->len) &&
3969 		    bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
3970 			/*
3971 			 * Our connection. Give up conn list lock
3972 			 * as we are done traversing the list.
3973 			 */
3974 			rw_exit(&hca->cl_conn_list.conn_lock);
3975 			if (cn->c_state == C_CONNECTED) {
3976 				cn->c_ref++;	/* sharing a conn */
3977 				mutex_exit(&cn->c_lock);
3978 				*conn = cn;
3979 				return (status);
3980 			}
3981 			if (cn->c_state == C_CONN_PEND) {
3982 				/*
3983 				 * Hold a reference to this conn before
3984 				 * we give up the lock.
3985 				 */
3986 				cn->c_ref++;
3987 				timout =  ddi_get_lbolt() +
3988 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
3989 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
3990 				    &cn->c_lock, timout)) > 0 &&
3991 				    cn->c_state == C_CONN_PEND)
3992 					;
3993 				if (cv_stat == 0) {
3994 					cn->c_ref--;
3995 					mutex_exit(&cn->c_lock);
3996 					return (RDMA_INTR);
3997 				}
3998 				if (cv_stat < 0) {
3999 					cn->c_ref--;
4000 					mutex_exit(&cn->c_lock);
4001 					return (RDMA_TIMEDOUT);
4002 				}
4003 				if (cn->c_state == C_CONNECTED) {
4004 					*conn = cn;
4005 					mutex_exit(&cn->c_lock);
4006 					return (status);
4007 				} else {
4008 					cn->c_ref--;
4009 					mutex_exit(&cn->c_lock);
4010 					return (RDMA_TIMEDOUT);
4011 				}
4012 			}
4013 		}
4014 		mutex_exit(&cn->c_lock);
4015 		cn = cn->c_next;
4016 	}
4017 	rw_exit(&hca->cl_conn_list.conn_lock);
4018 
4019 	bzero(&path, sizeof (ibt_path_info_t));
4020 	bzero(&s_ip, sizeof (ibt_ip_addr_t));
4021 	bzero(&d_ip, sizeof (ibt_ip_addr_t));
4022 
4023 	status = rib_chk_srv_ibaddr(svcaddr, addr_type, &path, &s_ip, &d_ip);
4024 	if (status != RDMA_SUCCESS) {
4025 		return (RDMA_FAILED);
4026 	}
4027 
4028 	/*
4029 	 * Channel to server doesn't exist yet, create one.
4030 	 */
4031 	if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
4032 		return (RDMA_FAILED);
4033 	}
4034 	cn = qptoc(qp);
4035 	cn->c_state = C_CONN_PEND;
4036 	cn->c_ref = 1;
4037 
4038 	/*
4039 	 * Add to conn list.
4040 	 * We had given up the READER lock. In the time since then,
4041 	 * another thread might have created the connection we are
4042 	 * trying here. But for now, that is quiet alright - there
4043 	 * might be two connections between a pair of hosts instead
4044 	 * of one. If we really want to close that window,
4045 	 * then need to check the list after acquiring the
4046 	 * WRITER lock.
4047 	 */
4048 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4049 	status = rib_conn_to_srv(hca, qp, &path, &s_ip, &d_ip);
4050 	mutex_enter(&cn->c_lock);
4051 	if (status == RDMA_SUCCESS) {
4052 		cn->c_state = C_CONNECTED;
4053 		*conn = cn;
4054 	} else {
4055 		cn->c_state = C_ERROR_CONN;
4056 		cn->c_ref--;
4057 	}
4058 	cv_broadcast(&cn->c_cv);
4059 	mutex_exit(&cn->c_lock);
4060 	return (status);
4061 }
4062 
4063 static rdma_stat
4064 rib_conn_release(CONN *conn)
4065 {
4066 	rib_qp_t	*qp = ctoqp(conn);
4067 
4068 	mutex_enter(&conn->c_lock);
4069 	conn->c_ref--;
4070 
4071 	/*
4072 	 * If a conn is C_ERROR_CONN, close the channel.
4073 	 * If it's CONNECTED, keep it that way.
4074 	 */
4075 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4076 		conn->c_state = C_DISCONN_PEND;
4077 		mutex_exit(&conn->c_lock);
4078 		if (qp->mode == RIB_SERVER)
4079 			(void) rib_disconnect_channel(conn,
4080 			    &qp->hca->srv_conn_list);
4081 		else
4082 			(void) rib_disconnect_channel(conn,
4083 			    &qp->hca->cl_conn_list);
4084 		return (RDMA_SUCCESS);
4085 	}
4086 	mutex_exit(&conn->c_lock);
4087 	return (RDMA_SUCCESS);
4088 }
4089 
4090 /*
4091  * Add at front of list
4092  */
4093 static struct rdma_done_list *
4094 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4095 {
4096 	struct rdma_done_list *rd;
4097 
4098 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4099 
4100 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4101 	rd->xid = xid;
4102 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4103 
4104 	rd->prev = NULL;
4105 	rd->next = qp->rdlist;
4106 	if (qp->rdlist != NULL)
4107 		qp->rdlist->prev = rd;
4108 	qp->rdlist = rd;
4109 
4110 	return (rd);
4111 }
4112 
4113 static void
4114 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4115 {
4116 	struct rdma_done_list *r;
4117 
4118 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4119 
4120 	r = rd->next;
4121 	if (r != NULL) {
4122 		r->prev = rd->prev;
4123 	}
4124 
4125 	r = rd->prev;
4126 	if (r != NULL) {
4127 		r->next = rd->next;
4128 	} else {
4129 		qp->rdlist = rd->next;
4130 	}
4131 
4132 	cv_destroy(&rd->rdma_done_cv);
4133 	kmem_free(rd, sizeof (*rd));
4134 }
4135 
4136 static void
4137 rdma_done_rem_list(rib_qp_t *qp)
4138 {
4139 	struct rdma_done_list	*r, *n;
4140 
4141 	mutex_enter(&qp->rdlist_lock);
4142 	for (r = qp->rdlist; r != NULL; r = n) {
4143 		n = r->next;
4144 		rdma_done_rm(qp, r);
4145 	}
4146 	mutex_exit(&qp->rdlist_lock);
4147 }
4148 
4149 static void
4150 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4151 {
4152 	struct rdma_done_list *r = qp->rdlist;
4153 
4154 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4155 
4156 	while (r) {
4157 		if (r->xid == xid) {
4158 			cv_signal(&r->rdma_done_cv);
4159 			return;
4160 		} else {
4161 			r = r->next;
4162 		}
4163 	}
4164 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4165 	    int, xid);
4166 }
4167 
4168 
4169 /*
4170  * Goes through all connections and closes the channel
4171  * This will cause all the WRs on those channels to be
4172  * flushed.
4173  */
4174 static void
4175 rib_close_channels(rib_conn_list_t *connlist)
4176 {
4177 	CONN 		*conn;
4178 	rib_qp_t	*qp;
4179 
4180 	rw_enter(&connlist->conn_lock, RW_READER);
4181 	conn = connlist->conn_hd;
4182 	while (conn != NULL) {
4183 		mutex_enter(&conn->c_lock);
4184 		qp = ctoqp(conn);
4185 		if (conn->c_state == C_CONNECTED) {
4186 			/*
4187 			 * Live connection in CONNECTED state.
4188 			 * Call ibt_close_rc_channel in nonblocking mode
4189 			 * with no callbacks.
4190 			 */
4191 			conn->c_state = C_ERROR_CONN;
4192 			(void) ibt_close_rc_channel(qp->qp_hdl,
4193 			    IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
4194 			(void) ibt_free_channel(qp->qp_hdl);
4195 			qp->qp_hdl = NULL;
4196 		} else {
4197 			if (conn->c_state == C_ERROR_CONN &&
4198 			    qp->qp_hdl != NULL) {
4199 				/*
4200 				 * Connection in ERROR state but
4201 				 * channel is not yet freed.
4202 				 */
4203 				(void) ibt_close_rc_channel(qp->qp_hdl,
4204 				    IBT_NOCALLBACKS, NULL, 0, NULL,
4205 				    NULL, 0);
4206 				(void) ibt_free_channel(qp->qp_hdl);
4207 				qp->qp_hdl = NULL;
4208 			}
4209 		}
4210 		mutex_exit(&conn->c_lock);
4211 		conn = conn->c_next;
4212 	}
4213 	rw_exit(&connlist->conn_lock);
4214 }
4215 
4216 /*
4217  * Frees up all connections that are no longer being referenced
4218  */
4219 static void
4220 rib_purge_connlist(rib_conn_list_t *connlist)
4221 {
4222 	CONN 		*conn;
4223 
4224 top:
4225 	rw_enter(&connlist->conn_lock, RW_READER);
4226 	conn = connlist->conn_hd;
4227 	while (conn != NULL) {
4228 		mutex_enter(&conn->c_lock);
4229 
4230 		/*
4231 		 * At this point connection is either in ERROR
4232 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4233 		 * then some other thread is culling that connection.
4234 		 * If not and if c_ref is 0, then destroy the connection.
4235 		 */
4236 		if (conn->c_ref == 0 &&
4237 		    conn->c_state != C_DISCONN_PEND) {
4238 			/*
4239 			 * Cull the connection
4240 			 */
4241 			conn->c_state = C_DISCONN_PEND;
4242 			mutex_exit(&conn->c_lock);
4243 			rw_exit(&connlist->conn_lock);
4244 			(void) rib_disconnect_channel(conn, connlist);
4245 			goto top;
4246 		} else {
4247 			/*
4248 			 * conn disconnect already scheduled or will
4249 			 * happen from conn_release when c_ref drops to 0.
4250 			 */
4251 			mutex_exit(&conn->c_lock);
4252 		}
4253 		conn = conn->c_next;
4254 	}
4255 	rw_exit(&connlist->conn_lock);
4256 
4257 	/*
4258 	 * At this point, only connections with c_ref != 0 are on the list
4259 	 */
4260 }
4261 
4262 /*
4263  * Cleans and closes up all uses of the HCA
4264  */
4265 static void
4266 rib_detach_hca(rib_hca_t *hca)
4267 {
4268 
4269 	/*
4270 	 * Stop all services on the HCA
4271 	 * Go through cl_conn_list and close all rc_channels
4272 	 * Go through svr_conn_list and close all rc_channels
4273 	 * Free connections whose c_ref has dropped to 0
4274 	 * Destroy all CQs
4275 	 * Deregister and released all buffer pool memory after all
4276 	 * connections are destroyed
4277 	 * Free the protection domain
4278 	 * ibt_close_hca()
4279 	 */
4280 	rw_enter(&hca->state_lock, RW_WRITER);
4281 	if (hca->state == HCA_DETACHED) {
4282 		rw_exit(&hca->state_lock);
4283 		return;
4284 	}
4285 
4286 	hca->state = HCA_DETACHED;
4287 	rib_stat->nhca_inited--;
4288 
4289 	rib_stop_services(hca);
4290 	rib_close_channels(&hca->cl_conn_list);
4291 	rib_close_channels(&hca->srv_conn_list);
4292 
4293 	rib_mod.rdma_count--;
4294 
4295 	rw_exit(&hca->state_lock);
4296 
4297 	/*
4298 	 * purge will free all datastructures used by CQ handlers. We don't
4299 	 * want to receive completions after purge, so we'll free the CQs now.
4300 	 */
4301 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4302 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4303 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4304 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4305 
4306 	rib_purge_connlist(&hca->cl_conn_list);
4307 	rib_purge_connlist(&hca->srv_conn_list);
4308 
4309 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4310 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4311 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4312 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4313 	if (stats_enabled) {
4314 		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4315 		    GLOBAL_ZONEID);
4316 	}
4317 
4318 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4319 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4320 	if (hca->srv_conn_list.conn_hd == NULL &&
4321 	    hca->cl_conn_list.conn_hd == NULL) {
4322 		/*
4323 		 * conn_lists are NULL, so destroy
4324 		 * buffers, close hca and be done.
4325 		 */
4326 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4327 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4328 		rib_destroy_cache(hca);
4329 		rdma_unregister_mod(&rib_mod);
4330 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4331 		(void) ibt_close_hca(hca->hca_hdl);
4332 		hca->hca_hdl = NULL;
4333 	}
4334 	rw_exit(&hca->cl_conn_list.conn_lock);
4335 	rw_exit(&hca->srv_conn_list.conn_lock);
4336 
4337 	if (hca->hca_hdl != NULL) {
4338 		mutex_enter(&hca->inuse_lock);
4339 		while (hca->inuse)
4340 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4341 		mutex_exit(&hca->inuse_lock);
4342 
4343 		rdma_unregister_mod(&rib_mod);
4344 
4345 		/*
4346 		 * conn_lists are now NULL, so destroy
4347 		 * buffers, close hca and be done.
4348 		 */
4349 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4350 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4351 		rib_destroy_cache(hca);
4352 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4353 		(void) ibt_close_hca(hca->hca_hdl);
4354 		hca->hca_hdl = NULL;
4355 	}
4356 }
4357 
4358 static void
4359 rib_server_side_cache_reclaim(void *argp)
4360 {
4361 	cache_avl_struct_t    *rcas;
4362 	rib_lrc_entry_t		*rb;
4363 	rib_hca_t *hca = (rib_hca_t *)argp;
4364 
4365 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4366 	rcas = avl_first(&hca->avl_tree);
4367 	if (rcas != NULL)
4368 		avl_remove(&hca->avl_tree, rcas);
4369 
4370 	while (rcas != NULL) {
4371 		while (rcas->r.forw != &rcas->r) {
4372 			rcas->elements--;
4373 			rib_total_buffers --;
4374 			rb = rcas->r.forw;
4375 			remque(rb);
4376 			if (rb->registered)
4377 				(void) rib_deregistermem_via_hca(hca,
4378 				    rb->lrc_buf, rb->lrc_mhandle);
4379 			cache_allocation -= rb->lrc_len;
4380 			kmem_free(rb->lrc_buf, rb->lrc_len);
4381 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4382 		}
4383 		mutex_destroy(&rcas->node_lock);
4384 		kmem_cache_free(hca->server_side_cache, rcas);
4385 		rcas = avl_first(&hca->avl_tree);
4386 		if (rcas != NULL)
4387 			avl_remove(&hca->avl_tree, rcas);
4388 	}
4389 	rw_exit(&hca->avl_rw_lock);
4390 }
4391 
4392 static void
4393 rib_server_side_cache_cleanup(void *argp)
4394 {
4395 	cache_avl_struct_t    *rcas;
4396 	rib_lrc_entry_t		*rb;
4397 	rib_hca_t *hca = (rib_hca_t *)argp;
4398 
4399 	rw_enter(&hca->avl_rw_lock, RW_READER);
4400 	if (cache_allocation < cache_limit) {
4401 		rw_exit(&hca->avl_rw_lock);
4402 		return;
4403 	}
4404 	rw_exit(&hca->avl_rw_lock);
4405 
4406 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4407 	rcas = avl_last(&hca->avl_tree);
4408 	if (rcas != NULL)
4409 		avl_remove(&hca->avl_tree, rcas);
4410 
4411 	while (rcas != NULL) {
4412 		while (rcas->r.forw != &rcas->r) {
4413 			rcas->elements--;
4414 			rib_total_buffers --;
4415 			rb = rcas->r.forw;
4416 			remque(rb);
4417 			if (rb->registered)
4418 				(void) rib_deregistermem_via_hca(hca,
4419 				    rb->lrc_buf, rb->lrc_mhandle);
4420 			cache_allocation -= rb->lrc_len;
4421 			kmem_free(rb->lrc_buf, rb->lrc_len);
4422 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4423 		}
4424 		mutex_destroy(&rcas->node_lock);
4425 		if (hca->server_side_cache) {
4426 			kmem_cache_free(hca->server_side_cache, rcas);
4427 		}
4428 		if ((cache_allocation) < cache_limit) {
4429 			rw_exit(&hca->avl_rw_lock);
4430 			return;
4431 		}
4432 
4433 		rcas = avl_last(&hca->avl_tree);
4434 		if (rcas != NULL)
4435 			avl_remove(&hca->avl_tree, rcas);
4436 	}
4437 	rw_exit(&hca->avl_rw_lock);
4438 }
4439 
4440 static int
4441 avl_compare(const void *t1, const void *t2)
4442 {
4443 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4444 		return (0);
4445 
4446 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4447 		return (-1);
4448 
4449 	return (1);
4450 }
4451 
4452 static void
4453 rib_destroy_cache(rib_hca_t *hca)
4454 {
4455 	if (hca->reg_cache_clean_up != NULL) {
4456 		ddi_taskq_destroy(hca->reg_cache_clean_up);
4457 		hca->reg_cache_clean_up = NULL;
4458 	}
4459 	if (hca->avl_init) {
4460 		rib_server_side_cache_reclaim((void *)hca);
4461 		if (hca->server_side_cache) {
4462 			kmem_cache_destroy(hca->server_side_cache);
4463 			hca->server_side_cache = NULL;
4464 		}
4465 		avl_destroy(&hca->avl_tree);
4466 		mutex_destroy(&hca->cache_allocation);
4467 		rw_destroy(&hca->avl_rw_lock);
4468 	}
4469 	hca->avl_init = FALSE;
4470 }
4471 
4472 static void
4473 rib_force_cleanup(void *hca)
4474 {
4475 	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
4476 		(void) ddi_taskq_dispatch(
4477 		    ((rib_hca_t *)hca)->reg_cache_clean_up,
4478 		    rib_server_side_cache_cleanup,
4479 		    (void *)hca, DDI_NOSLEEP);
4480 }
4481 
4482 static rib_lrc_entry_t *
4483 rib_get_cache_buf(CONN *conn, uint32_t len)
4484 {
4485 	cache_avl_struct_t	cas, *rcas;
4486 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4487 	rib_lrc_entry_t *reply_buf;
4488 	avl_index_t where = NULL;
4489 	uint64_t c_alloc = 0;
4490 
4491 	if (!hca->avl_init)
4492 		goto  error_alloc;
4493 
4494 	cas.len = len;
4495 
4496 	rw_enter(&hca->avl_rw_lock, RW_READER);
4497 
4498 	mutex_enter(&hca->cache_allocation);
4499 	c_alloc = cache_allocation;
4500 	mutex_exit(&hca->cache_allocation);
4501 
4502 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
4503 	    &where)) == NULL) {
4504 		/* Am I above the cache limit */
4505 		if ((c_alloc + len) >= cache_limit) {
4506 			rib_force_cleanup((void *)hca);
4507 			rw_exit(&hca->avl_rw_lock);
4508 			cache_misses_above_the_limit ++;
4509 
4510 			/* Allocate and register the buffer directly */
4511 			goto error_alloc;
4512 		}
4513 
4514 		rw_exit(&hca->avl_rw_lock);
4515 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
4516 
4517 		/* Recheck to make sure no other thread added the entry in */
4518 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
4519 		    &cas, &where)) == NULL) {
4520 			/* Allocate an avl tree entry */
4521 			rcas = (cache_avl_struct_t *)
4522 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
4523 
4524 			bzero(rcas, sizeof (cache_avl_struct_t));
4525 			rcas->elements = 0;
4526 			rcas->r.forw = &rcas->r;
4527 			rcas->r.back = &rcas->r;
4528 			rcas->len = len;
4529 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
4530 			avl_insert(&hca->avl_tree, rcas, where);
4531 		}
4532 	}
4533 
4534 	mutex_enter(&rcas->node_lock);
4535 
4536 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
4537 		rib_total_buffers--;
4538 		cache_hits++;
4539 		reply_buf = rcas->r.forw;
4540 		remque(reply_buf);
4541 		rcas->elements--;
4542 		mutex_exit(&rcas->node_lock);
4543 		rw_exit(&hca->avl_rw_lock);
4544 		mutex_enter(&hca->cache_allocation);
4545 		cache_allocation -= len;
4546 		mutex_exit(&hca->cache_allocation);
4547 	} else {
4548 		/* Am I above the cache limit */
4549 		mutex_exit(&rcas->node_lock);
4550 		if ((c_alloc + len) >= cache_limit) {
4551 			rib_force_cleanup((void *)hca);
4552 			rw_exit(&hca->avl_rw_lock);
4553 			cache_misses_above_the_limit ++;
4554 			/* Allocate and register the buffer directly */
4555 			goto error_alloc;
4556 		}
4557 		rw_exit(&hca->avl_rw_lock);
4558 		cache_misses ++;
4559 		/* Allocate a reply_buf entry */
4560 		reply_buf = (rib_lrc_entry_t *)
4561 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4562 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
4563 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
4564 		reply_buf->lrc_len  = len;
4565 		reply_buf->registered = FALSE;
4566 		reply_buf->avl_node = (void *)rcas;
4567 	}
4568 
4569 	return (reply_buf);
4570 
4571 error_alloc:
4572 	reply_buf = (rib_lrc_entry_t *)
4573 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4574 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
4575 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
4576 	reply_buf->lrc_len = len;
4577 	reply_buf->registered = FALSE;
4578 	reply_buf->avl_node = NULL;
4579 
4580 	return (reply_buf);
4581 }
4582 
4583 /*
4584  * Return a pre-registered back to the cache (without
4585  * unregistering the buffer)..
4586  */
4587 
4588 static void
4589 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
4590 {
4591 	cache_avl_struct_t    cas, *rcas;
4592 	avl_index_t where = NULL;
4593 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4594 
4595 	if (!hca->avl_init)
4596 		goto  error_free;
4597 
4598 	cas.len = reg_buf->lrc_len;
4599 	rw_enter(&hca->avl_rw_lock, RW_READER);
4600 	if ((rcas = (cache_avl_struct_t *)
4601 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
4602 		rw_exit(&hca->avl_rw_lock);
4603 		goto error_free;
4604 	} else {
4605 		rib_total_buffers ++;
4606 		cas.len = reg_buf->lrc_len;
4607 		mutex_enter(&rcas->node_lock);
4608 		insque(reg_buf, &rcas->r);
4609 		rcas->elements ++;
4610 		mutex_exit(&rcas->node_lock);
4611 		rw_exit(&hca->avl_rw_lock);
4612 		mutex_enter(&hca->cache_allocation);
4613 		cache_allocation += cas.len;
4614 		mutex_exit(&hca->cache_allocation);
4615 	}
4616 
4617 	return;
4618 
4619 error_free:
4620 
4621 	if (reg_buf->registered)
4622 		(void) rib_deregistermem_via_hca(hca,
4623 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
4624 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
4625 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
4626 }
4627 
4628 static rdma_stat
4629 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
4630 	uint_t buflen, struct mrc *buf_handle)
4631 {
4632 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
4633 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
4634 	rdma_stat	status;
4635 
4636 
4637 	/*
4638 	 * Note: ALL buffer pools use the same memory type RDMARW.
4639 	 */
4640 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
4641 	if (status == RDMA_SUCCESS) {
4642 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
4643 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4644 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4645 	} else {
4646 		buf_handle->mrc_linfo = NULL;
4647 		buf_handle->mrc_lmr = 0;
4648 		buf_handle->mrc_rmr = 0;
4649 	}
4650 	return (status);
4651 }
4652 
4653 /* ARGSUSED */
4654 static rdma_stat
4655 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
4656     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
4657 {
4658 
4659 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
4660 	return (RDMA_SUCCESS);
4661 }
4662 
4663 /* ARGSUSED */
4664 static rdma_stat
4665 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
4666 {
4667 
4668 	(void) ibt_deregister_mr(hca->hca_hdl,
4669 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4670 	return (RDMA_SUCCESS);
4671 }
4672 
4673 /*
4674  * Check if the IP interface named by `lifrp' is RDMA-capable.
4675  */
4676 static boolean_t
4677 rpcib_rdma_capable_interface(struct lifreq *lifrp)
4678 {
4679 	char ifname[LIFNAMSIZ];
4680 	char *cp;
4681 
4682 	if (lifrp->lifr_type == IFT_IB)
4683 		return (B_TRUE);
4684 
4685 	/*
4686 	 * Strip off the logical interface portion before getting
4687 	 * intimate with the name.
4688 	 */
4689 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
4690 	if ((cp = strchr(ifname, ':')) != NULL)
4691 		*cp = '\0';
4692 
4693 	return (strcmp("lo0", ifname) == 0);
4694 }
4695 
4696 static int
4697 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
4698 {
4699 	vnode_t *kvp, *vp;
4700 	TIUSER  *tiptr;
4701 	struct  strioctl iocb;
4702 	k_sigset_t smask;
4703 	int	err = 0;
4704 
4705 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
4706 		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
4707 		    &tiptr, CRED()) == 0) {
4708 			vp = tiptr->fp->f_vnode;
4709 		} else {
4710 			VN_RELE(kvp);
4711 			return (EPROTO);
4712 		}
4713 	} else {
4714 		return (EPROTO);
4715 	}
4716 
4717 	iocb.ic_cmd = cmd;
4718 	iocb.ic_timout = 0;
4719 	iocb.ic_len = len;
4720 	iocb.ic_dp = (caddr_t)arg;
4721 	sigintr(&smask, 0);
4722 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4723 	sigunintr(&smask);
4724 	(void) t_kclose(tiptr, 0);
4725 	VN_RELE(kvp);
4726 	return (err);
4727 }
4728 
4729 /*
4730  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
4731  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
4732  */
4733 static int
4734 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
4735 {
4736 	int err;
4737 	struct lifnum lifn;
4738 
4739 	bzero(&lifn, sizeof (struct lifnum));
4740 	lifn.lifn_family = AF_UNSPEC;
4741 
4742 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
4743 	if (err != 0)
4744 		return (err);
4745 
4746 	/*
4747 	 * Pad the interface count to account for additional interfaces that
4748 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
4749 	 */
4750 	lifn.lifn_count += 4;
4751 
4752 	bzero(lifcp, sizeof (struct lifconf));
4753 	lifcp->lifc_family = AF_UNSPEC;
4754 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
4755 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
4756 
4757 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
4758 	if (err != 0) {
4759 		kmem_free(lifcp->lifc_buf, *bufsizep);
4760 		return (err);
4761 	}
4762 	return (0);
4763 }
4764 
4765 static boolean_t
4766 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
4767 {
4768 	uint_t i, nifs;
4769 	uint_t bufsize;
4770 	struct lifconf lifc;
4771 	struct lifreq *lifrp;
4772 	struct sockaddr_in *sinp;
4773 	struct sockaddr_in6 *sin6p;
4774 
4775 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
4776 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
4777 
4778 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
4779 		return (B_FALSE);
4780 
4781 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
4782 		kmem_free(lifc.lifc_buf, bufsize);
4783 		return (B_FALSE);
4784 	}
4785 
4786 	/*
4787 	 * Worst case is that all of the addresses are IB-capable and have
4788 	 * the same address family, so size our buffers accordingly.
4789 	 */
4790 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
4791 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
4792 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
4793 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
4794 
4795 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
4796 		if (!rpcib_rdma_capable_interface(lifrp))
4797 			continue;
4798 
4799 		if (lifrp->lifr_addr.ss_family == AF_INET) {
4800 			sinp = addrs4->ri_list;
4801 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
4802 			    sizeof (struct sockaddr_in));
4803 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
4804 			sin6p = addrs6->ri_list;
4805 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
4806 			    sizeof (struct sockaddr_in6));
4807 		}
4808 	}
4809 
4810 	kmem_free(lifc.lifc_buf, bufsize);
4811 	return (B_TRUE);
4812 }
4813 
4814 /* ARGSUSED */
4815 static int rpcib_cache_kstat_update(kstat_t *ksp, int rw) {
4816 
4817 	if (KSTAT_WRITE == rw) {
4818 		return (EACCES);
4819 	}
4820 	rpcib_kstat.cache_limit.value.ui64 =
4821 	    (uint64_t)cache_limit;
4822 	rpcib_kstat.cache_allocation.value.ui64 =
4823 	    (uint64_t)cache_allocation;
4824 	rpcib_kstat.cache_hits.value.ui64 =
4825 	    (uint64_t)cache_hits;
4826 	rpcib_kstat.cache_misses.value.ui64 =
4827 	    (uint64_t)cache_misses;
4828 	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
4829 	    (uint64_t)cache_misses_above_the_limit;
4830 	return (0);
4831 }
4832