xref: /titanic_51/usr/src/uts/common/rpc/rpcib.c (revision c2765d203a42aaeda144370182c6cda62904d860)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developors:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * The rpcib plugin. Implements the interface for RDMATF's
42  * interaction with IBTF.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84 
85 #define	NFS_RDMA_PORT	2050
86 
87 /*
88  * Convenience structure used by rpcib_get_ib_addresses()
89  */
90 typedef struct rpcib_ipaddrs {
91 	void	*ri_list;	/* pointer to list of addresses */
92 	uint_t	ri_count;	/* number of addresses in list */
93 	uint_t	ri_size;	/* size of ri_list in bytes */
94 } rpcib_ipaddrs_t;
95 
96 /*
97  * Prototype declarations for driver ops
98  */
99 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
100 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
101 				void *, void **);
102 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
103 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
104 static int	rpcib_do_ip_ioctl(int, int, void *);
105 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
106 static int rpcib_cache_kstat_update(kstat_t *, int);
107 static void rib_force_cleanup(void *);
108 
/*
 * Named-kstat data block for the rpcib registration cache.
 *
 * open_hcas() creates a virtual, writable KSTAT_TYPE_NAMED kstat called
 * "rpcib_cache" (module "unix", class "rpc"), points its ks_data at this
 * object and sets rpcib_cache_kstat_update() as the ks_update callback.
 * The number of named entries handed to kstat_create_zone() is computed as
 * sizeof (rpcib_kstat) / sizeof (kstat_named_t), so this structure must
 * contain nothing but kstat_named_t members.  Every entry is a 64-bit
 * unsigned counter (KSTAT_DATA_UINT64).
 */
109 struct {
110 	kstat_named_t cache_limit;
111 	kstat_named_t cache_allocation;
112 	kstat_named_t cache_hits;
113 	kstat_named_t cache_misses;
114 	kstat_named_t cache_misses_above_the_limit;
115 } rpcib_kstat = {
116 	{"cache_limit",			KSTAT_DATA_UINT64 },
117 	{"cache_allocation",		KSTAT_DATA_UINT64 },
118 	{"cache_hits",			KSTAT_DATA_UINT64 },
119 	{"cache_misses",		KSTAT_DATA_UINT64 },
120 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
121 };
122 
123 /* rpcib cb_ops */
124 static struct cb_ops rpcib_cbops = {
125 	nulldev,		/* open */
126 	nulldev,		/* close */
127 	nodev,			/* strategy */
128 	nodev,			/* print */
129 	nodev,			/* dump */
130 	nodev,			/* read */
131 	nodev,			/* write */
132 	nodev,			/* ioctl */
133 	nodev,			/* devmap */
134 	nodev,			/* mmap */
135 	nodev,			/* segmap */
136 	nochpoll,		/* poll */
137 	ddi_prop_op,		/* prop_op */
138 	NULL,			/* stream */
139 	D_MP,			/* cb_flag */
140 	CB_REV,			/* rev */
141 	nodev,			/* int (*cb_aread)() */
142 	nodev			/* int (*cb_awrite)() */
143 };
144 
145 /*
146  * Device options
147  */
148 static struct dev_ops rpcib_ops = {
149 	DEVO_REV,		/* devo_rev, */
150 	0,			/* refcnt  */
151 	rpcib_getinfo,		/* info */
152 	nulldev,		/* identify */
153 	nulldev,		/* probe */
154 	rpcib_attach,		/* attach */
155 	rpcib_detach,		/* detach */
156 	nodev,			/* reset */
157 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
158 	NULL,			/* bus operations */
159 	NULL,			/* power */
160 	ddi_quiesce_not_needed,		/* quiesce */
161 };
162 
163 /*
164  * Module linkage information.
165  */
166 
167 static struct modldrv rib_modldrv = {
168 	&mod_driverops,		/* Driver module */
169 	"RPCIB plugin driver",	/* Driver name and version */
170 	&rpcib_ops,		/* Driver ops */
171 };
172 
173 static struct modlinkage rib_modlinkage = {
174 	MODREV_1,
175 	(void *)&rib_modldrv,
176 	NULL
177 };
178 
179 typedef struct rib_lrc_entry {
180 	struct rib_lrc_entry *forw;
181 	struct rib_lrc_entry *back;
182 	char *lrc_buf;
183 
184 	uint32_t lrc_len;
185 	void  *avl_node;
186 	bool_t registered;
187 
188 	struct mrc lrc_mhandle;
189 	bool_t lrc_on_freed_list;
190 } rib_lrc_entry_t;
191 
/*
 * Node type of the per-HCA AVL tree used by the server side cache.
 *
 * open_hcas() creates the hca->server_side_cache kmem cache with objects of
 * sizeof (cache_avl_struct_t) and passes the offset of avl_link to
 * avl_create() for hca->avl_tree, so avl_link is the AVL linkage embedded
 * in each node.
 *
 * NOTE(review): the roles of r, len, elements and node_lock are not visible
 * in this part of the file; presumably r heads a list of cached buffers of
 * size len, elements counts them and node_lock protects the node -- confirm
 * against rib_get_cache_buf()/rib_free_cache_buf().
 */
192 typedef	struct cache_struct	{
193 	rib_lrc_entry_t		r;
194 	uint32_t		len;
195 	uint32_t		elements;
196 	kmutex_t		node_lock;
197 	avl_node_t		avl_link;
198 } cache_avl_struct_t;
199 
200 static uint64_t	rib_total_buffers = 0;
201 uint64_t	cache_limit = 100 * 1024 * 1024;
202 static volatile uint64_t	cache_allocation = 0;
203 static uint64_t	cache_watermark = 80 * 1024 * 1024;
204 static uint64_t	cache_hits = 0;
205 static uint64_t	cache_misses = 0;
206 static uint64_t	cache_cold_misses = 0;
207 static uint64_t	cache_hot_misses = 0;
208 static uint64_t	cache_misses_above_the_limit = 0;
209 static bool_t	stats_enabled = FALSE;
210 
211 static uint64_t max_unsignaled_rws = 5;
212 
213 /*
214  * rib_stat: private data pointer used when registering
215  *	with the IBTF.  It is returned to the consumer
216  *	in all callbacks.
217  */
218 static rpcib_state_t *rib_stat = NULL;
219 
220 #define	RNR_RETRIES	IBT_RNR_RETRY_1
221 #define	MAX_PORTS	2
222 
223 int preposted_rbufs = RDMA_BUFS_GRANT;
224 int send_threshold = 1;
225 
226 /*
227  * State of the plugin.
228  * ACCEPT = accepting new connections and requests.
229  * NO_ACCEPT = not accepting new connection and requests.
230  * This should eventually move to rpcib_state_t structure, since this
231  * will tell in which state the plugin is for a particular type of service
232  * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
233  * state for one and in no_accept state for the other.
234  */
235 int		plugin_state;
236 kmutex_t	plugin_state_lock;
237 
238 ldi_ident_t rpcib_li;
239 
240 /*
241  * RPCIB RDMATF operations
242  */
243 #if defined(MEASURE_POOL_DEPTH)
244 static void rib_posted_rbufs(uint32_t x) { return; }
245 #endif
246 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
247 static rdma_stat rib_disconnect(CONN *conn);
248 static void rib_listen(struct rdma_svc_data *rd);
249 static void rib_listen_stop(struct rdma_svc_data *rd);
250 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
251 	uint_t buflen, struct mrc *buf_handle);
252 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
253 	struct mrc buf_handle);
254 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
255 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
256 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
257 		struct mrc buf_handle);
258 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
259 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
260 	void *lrc);
261 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
262 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
263 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
264 	caddr_t buf, int len, int cpu);
265 
266 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
267 
268 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
269 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
270 
271 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
272 
273 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
274 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
275 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
276 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
277 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
278 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
279 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
280 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
281 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
282 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
283 static rdma_stat rib_conn_release(CONN *conn);
284 static rdma_stat rib_getinfo(rdma_info_t *info);
285 
286 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
287 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
288 static void rib_destroy_cache(rib_hca_t *hca);
289 static	void	rib_server_side_cache_reclaim(void *argp);
290 static int avl_compare(const void *t1, const void *t2);
291 
292 static void rib_stop_services(rib_hca_t *);
293 static void rib_close_channels(rib_conn_list_t *);
294 
295 /*
296  * RPCIB addressing operations
297  */
298 
299 /*
300  * RDMA operations the RPCIB module exports
301  */
302 static rdmaops_t rib_ops = {
303 	rib_reachable,
304 	rib_conn_get,
305 	rib_conn_release,
306 	rib_listen,
307 	rib_listen_stop,
308 	rib_registermem,
309 	rib_deregistermem,
310 	rib_registermemsync,
311 	rib_deregistermemsync,
312 	rib_syncmem,
313 	rib_reg_buf_alloc,
314 	rib_reg_buf_free,
315 	rib_send,
316 	rib_send_resp,
317 	rib_post_resp,
318 	rib_post_resp_remove,
319 	rib_post_recv,
320 	rib_recv,
321 	rib_read,
322 	rib_write,
323 	rib_getinfo,
324 };
325 
326 /*
327  * RDMATF RPCIB plugin details
328  */
329 static rdma_mod_t rib_mod = {
330 	"ibtf",		/* api name */
331 	RDMATF_VERS_1,
332 	0,
333 	&rib_ops,	/* rdma op vector for ibtf */
334 };
335 
336 static rdma_stat open_hcas(rpcib_state_t *);
337 static rdma_stat rib_qp_init(rib_qp_t *, int);
338 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
339 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
340 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
341 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
342 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
343 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
344 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
345 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
346 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
347 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *,
348 	ibt_ip_addr_t *, ibt_ip_addr_t *);
349 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
350 	rib_qp_t **);
351 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
352 	rib_qp_t **);
353 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
354 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
355 static int rib_free_sendwait(struct send_wid *);
356 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
357 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
358 static void rdma_done_rem_list(rib_qp_t *);
359 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
360 
361 static void rib_async_handler(void *,
362 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
363 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
364 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
365 static int rib_free_svc_recv(struct svc_recv *);
366 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
367 static void rib_free_wid(struct recv_wid *);
368 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
369 static void rib_detach_hca(rib_hca_t *);
370 static rdma_stat rib_chk_srv_ibaddr(struct netbuf *, int,
371 	ibt_path_info_t *, ibt_ip_addr_t *, ibt_ip_addr_t *);
372 
373 /*
374  * Registration with IBTF as a consumer
375  */
376 static struct ibt_clnt_modinfo_s rib_modinfo = {
377 	IBTI_V_CURR,
378 	IBT_GENERIC,
379 	rib_async_handler,	/* async event handler */
380 	NULL,			/* Memory Region Handler */
381 	"nfs/ib"
382 };
383 
384 /*
385  * Global structure
386  */
387 
/*
 * Driver-wide attach state (the single instance is the global "rpcib").
 *
 * rpcib_dip is set to the attaching devinfo node by rpcib_attach(), reset
 * to NULL by rpcib_detach(), and handed back by rpcib_getinfo() for
 * DDI_INFO_DEVT2DEVINFO.  rpcib_mutex is initialized in rpcib_attach(),
 * destroyed in rpcib_detach(), and held while rpcib_dip is written.
 */
388 typedef struct rpcib_s {
389 	dev_info_t	*rpcib_dip;
390 	kmutex_t	rpcib_mutex;
391 } rpcib_t;
392 
393 rpcib_t rpcib;
394 
395 /*
396  * /etc/system controlled variable to control
397  * debugging in rpcib kernel module.
398  * Set it to values greater than 1 to control
399  * the amount of debugging messages required.
400  */
401 int rib_debug = 0;
402 
403 int
404 _init(void)
405 {
406 	int error;
407 
408 	error = mod_install((struct modlinkage *)&rib_modlinkage);
409 	if (error != 0) {
410 		/*
411 		 * Could not load module
412 		 */
413 		return (error);
414 	}
415 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
416 	return (0);
417 }
418 
419 int
420 _fini()
421 {
422 	int status;
423 
424 	if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) {
425 		return (EBUSY);
426 	}
427 
428 	/*
429 	 * Remove module
430 	 */
431 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
432 		(void) rdma_register_mod(&rib_mod);
433 		return (status);
434 	}
435 	mutex_destroy(&plugin_state_lock);
436 	return (0);
437 }
438 
439 int
440 _info(struct modinfo *modinfop)
441 {
442 	return (mod_info(&rib_modlinkage, modinfop));
443 }
444 
445 /*
446  * rpcib_getinfo()
447  * Given the device number, return the devinfo pointer or the
448  * instance number.
449  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
450  */
451 
452 /*ARGSUSED*/
453 static int
454 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
455 {
456 	int ret = DDI_SUCCESS;
457 
458 	switch (cmd) {
459 	case DDI_INFO_DEVT2DEVINFO:
460 		if (rpcib.rpcib_dip != NULL)
461 			*result = rpcib.rpcib_dip;
462 		else {
463 			*result = NULL;
464 			ret = DDI_FAILURE;
465 		}
466 		break;
467 
468 	case DDI_INFO_DEVT2INSTANCE:
469 		*result = NULL;
470 		break;
471 
472 	default:
473 		ret = DDI_FAILURE;
474 	}
475 	return (ret);
476 }
477 
478 static int
479 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
480 {
481 	ibt_status_t	ibt_status;
482 	rdma_stat	r_status;
483 
484 	switch (cmd) {
485 	case DDI_ATTACH:
486 		break;
487 	case DDI_RESUME:
488 		return (DDI_SUCCESS);
489 	default:
490 		return (DDI_FAILURE);
491 	}
492 
493 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
494 
495 	mutex_enter(&rpcib.rpcib_mutex);
496 	if (rpcib.rpcib_dip != NULL) {
497 		mutex_exit(&rpcib.rpcib_mutex);
498 		return (DDI_FAILURE);
499 	}
500 	rpcib.rpcib_dip = dip;
501 	mutex_exit(&rpcib.rpcib_mutex);
502 	/*
503 	 * Create the "rpcib" minor-node.
504 	 */
505 	if (ddi_create_minor_node(dip,
506 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
507 		/* Error message, no cmn_err as they print on console */
508 		return (DDI_FAILURE);
509 	}
510 
511 	if (rib_stat == NULL) {
512 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
513 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
514 	}
515 
516 	rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
517 	if (rib_stat->hca_count < 1) {
518 		mutex_destroy(&rib_stat->open_hca_lock);
519 		kmem_free(rib_stat, sizeof (*rib_stat));
520 		rib_stat = NULL;
521 		return (DDI_FAILURE);
522 	}
523 
524 	ibt_status = ibt_attach(&rib_modinfo, dip,
525 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
526 
527 	if (ibt_status != IBT_SUCCESS) {
528 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
529 		mutex_destroy(&rib_stat->open_hca_lock);
530 		kmem_free(rib_stat, sizeof (*rib_stat));
531 		rib_stat = NULL;
532 		return (DDI_FAILURE);
533 	}
534 
535 	mutex_enter(&rib_stat->open_hca_lock);
536 	if (open_hcas(rib_stat) != RDMA_SUCCESS) {
537 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
538 		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
539 		mutex_exit(&rib_stat->open_hca_lock);
540 		mutex_destroy(&rib_stat->open_hca_lock);
541 		kmem_free(rib_stat, sizeof (*rib_stat));
542 		rib_stat = NULL;
543 		return (DDI_FAILURE);
544 	}
545 	mutex_exit(&rib_stat->open_hca_lock);
546 
547 	/*
548 	 * Register with rdmatf
549 	 */
550 	rib_mod.rdma_count = rib_stat->hca_count;
551 	r_status = rdma_register_mod(&rib_mod);
552 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
553 		rib_detach_hca(rib_stat->hca);
554 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
555 		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
556 		mutex_destroy(&rib_stat->open_hca_lock);
557 		kmem_free(rib_stat, sizeof (*rib_stat));
558 		rib_stat = NULL;
559 		return (DDI_FAILURE);
560 	}
561 
562 
563 	return (DDI_SUCCESS);
564 }
565 
566 /*ARGSUSED*/
567 static int
568 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
569 {
570 	switch (cmd) {
571 
572 	case DDI_DETACH:
573 		break;
574 
575 	case DDI_SUSPEND:
576 	default:
577 		return (DDI_FAILURE);
578 	}
579 
580 	/*
581 	 * Detach the hca and free resources
582 	 */
583 	mutex_enter(&plugin_state_lock);
584 	plugin_state = NO_ACCEPT;
585 	mutex_exit(&plugin_state_lock);
586 	rib_detach_hca(rib_stat->hca);
587 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
588 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
589 
590 	mutex_enter(&rpcib.rpcib_mutex);
591 	rpcib.rpcib_dip = NULL;
592 	mutex_exit(&rpcib.rpcib_mutex);
593 
594 	mutex_destroy(&rpcib.rpcib_mutex);
595 	return (DDI_SUCCESS);
596 }
597 
598 
599 static void rib_rbufpool_free(rib_hca_t *, int);
600 static void rib_rbufpool_deregister(rib_hca_t *, int);
601 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
602 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
603 static rdma_stat rib_rem_replylist(rib_qp_t *);
604 static int rib_remreply(rib_qp_t *, struct reply *);
605 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
606 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
607 
608 
609 /*
610  * One CQ pair per HCA
611  */
612 static rdma_stat
613 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
614 	rib_cq_t **cqp, rpcib_state_t *ribstat)
615 {
616 	rib_cq_t	*cq;
617 	ibt_cq_attr_t	cq_attr;
618 	uint32_t	real_size;
619 	ibt_status_t	status;
620 	rdma_stat	error = RDMA_SUCCESS;
621 
622 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
623 	cq->rib_hca = hca;
624 	cq_attr.cq_size = cq_size;
625 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
626 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
627 	    &real_size);
628 	if (status != IBT_SUCCESS) {
629 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
630 		    " status=%d", status);
631 		error = RDMA_FAILED;
632 		goto fail;
633 	}
634 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
635 
636 	/*
637 	 * Enable CQ callbacks. CQ Callbacks are single shot
638 	 * (e.g. you have to call ibt_enable_cq_notify()
639 	 * after each callback to get another one).
640 	 */
641 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
642 	if (status != IBT_SUCCESS) {
643 		cmn_err(CE_WARN, "rib_create_cq: "
644 		    "enable_cq_notify failed, status %d", status);
645 		error = RDMA_FAILED;
646 		goto fail;
647 	}
648 	*cqp = cq;
649 
650 	return (error);
651 fail:
652 	if (cq->rib_cq_hdl)
653 		(void) ibt_free_cq(cq->rib_cq_hdl);
654 	if (cq)
655 		kmem_free(cq, sizeof (rib_cq_t));
656 	return (error);
657 }
658 
659 static rdma_stat
660 open_hcas(rpcib_state_t *ribstat)
661 {
662 	rib_hca_t		*hca;
663 	ibt_status_t		ibt_status;
664 	rdma_stat		status;
665 	ibt_hca_portinfo_t	*pinfop;
666 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
667 	uint_t			size, cq_size;
668 	int			i;
669 	kstat_t *ksp;
670 	cache_avl_struct_t example_avl_node;
671 	char rssc_name[32];
672 
673 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
674 
675 	if (ribstat->hcas == NULL)
676 		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
677 		    sizeof (rib_hca_t), KM_SLEEP);
678 
679 	/*
680 	 * Open a hca and setup for RDMA
681 	 */
682 	for (i = 0; i < ribstat->hca_count; i++) {
683 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
684 		    ribstat->hca_guids[i],
685 		    &ribstat->hcas[i].hca_hdl);
686 		if (ibt_status != IBT_SUCCESS) {
687 			continue;
688 		}
689 		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
690 		hca = &(ribstat->hcas[i]);
691 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
692 		hca->state = HCA_INITED;
693 
694 		/*
695 		 * query HCA info
696 		 */
697 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
698 		if (ibt_status != IBT_SUCCESS) {
699 			goto fail1;
700 		}
701 
702 		/*
703 		 * One PD (Protection Domain) per HCA.
704 		 * A qp is allowed to access a memory region
705 		 * only when it's in the same PD as that of
706 		 * the memory region.
707 		 */
708 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
709 		if (ibt_status != IBT_SUCCESS) {
710 			goto fail1;
711 		}
712 
713 		/*
714 		 * query HCA ports
715 		 */
716 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
717 		    0, &pinfop, &hca->hca_nports, &size);
718 		if (ibt_status != IBT_SUCCESS) {
719 			goto fail2;
720 		}
721 		hca->hca_ports = pinfop;
722 		hca->hca_pinfosz = size;
723 		pinfop = NULL;
724 
725 		cq_size = DEF_CQ_SIZE; /* default cq size */
726 		/*
727 		 * Create 2 pairs of cq's (1 pair for client
728 		 * and the other pair for server) on this hca.
729 		 * If number of qp's gets too large, then several
730 		 * cq's will be needed.
731 		 */
732 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
733 		    &hca->svc_rcq, ribstat);
734 		if (status != RDMA_SUCCESS) {
735 			goto fail3;
736 		}
737 
738 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
739 		    &hca->svc_scq, ribstat);
740 		if (status != RDMA_SUCCESS) {
741 			goto fail3;
742 		}
743 
744 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
745 		    &hca->clnt_rcq, ribstat);
746 		if (status != RDMA_SUCCESS) {
747 			goto fail3;
748 		}
749 
750 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
751 		    &hca->clnt_scq, ribstat);
752 		if (status != RDMA_SUCCESS) {
753 			goto fail3;
754 		}
755 
756 		/*
757 		 * Create buffer pools.
758 		 * Note rib_rbuf_create also allocates memory windows.
759 		 */
760 		hca->recv_pool = rib_rbufpool_create(hca,
761 		    RECV_BUFFER, MAX_BUFS);
762 		if (hca->recv_pool == NULL) {
763 			goto fail3;
764 		}
765 
766 		hca->send_pool = rib_rbufpool_create(hca,
767 		    SEND_BUFFER, MAX_BUFS);
768 		if (hca->send_pool == NULL) {
769 			rib_rbufpool_destroy(hca, RECV_BUFFER);
770 			goto fail3;
771 		}
772 
773 		if (hca->server_side_cache == NULL) {
774 			(void) sprintf(rssc_name,
775 			    "rib_server_side_cache_%04d", i);
776 			hca->server_side_cache = kmem_cache_create(
777 			    rssc_name,
778 			    sizeof (cache_avl_struct_t), 0,
779 			    NULL,
780 			    NULL,
781 			    rib_server_side_cache_reclaim,
782 			    hca, NULL, 0);
783 		}
784 
785 		avl_create(&hca->avl_tree,
786 		    avl_compare,
787 		    sizeof (cache_avl_struct_t),
788 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
789 		    (uint_t)(uintptr_t)&example_avl_node);
790 
791 		rw_init(&hca->avl_rw_lock,
792 		    NULL, RW_DRIVER, hca->iblock);
793 		mutex_init(&hca->cache_allocation,
794 		    NULL, MUTEX_DRIVER, NULL);
795 		hca->avl_init = TRUE;
796 
797 		/* Create kstats for the cache */
798 		ASSERT(INGLOBALZONE(curproc));
799 
800 		if (!stats_enabled) {
801 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
802 			    KSTAT_TYPE_NAMED,
803 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
804 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
805 			    GLOBAL_ZONEID);
806 			if (ksp) {
807 				ksp->ks_data = (void *) &rpcib_kstat;
808 				ksp->ks_update = rpcib_cache_kstat_update;
809 				kstat_install(ksp);
810 				stats_enabled = TRUE;
811 			}
812 		}
813 		if (NULL == hca->reg_cache_clean_up) {
814 			hca->reg_cache_clean_up = ddi_taskq_create(NULL,
815 			    "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0);
816 		}
817 
818 		/*
819 		 * Initialize the registered service list and
820 		 * the lock
821 		 */
822 		hca->service_list = NULL;
823 		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
824 
825 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
826 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
827 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
828 		    hca->iblock);
829 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
830 		    hca->iblock);
831 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
832 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
833 		hca->inuse = TRUE;
834 		/*
835 		 * XXX One hca only. Add multi-hca functionality if needed
836 		 * later.
837 		 */
838 		ribstat->hca = hca;
839 		ribstat->nhca_inited++;
840 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
841 		break;
842 
843 fail3:
844 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
845 fail2:
846 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
847 fail1:
848 		(void) ibt_close_hca(hca->hca_hdl);
849 
850 	}
851 	if (ribstat->hca != NULL)
852 		return (RDMA_SUCCESS);
853 	else
854 		return (RDMA_FAILED);
855 }
856 
857 /*
858  * Callback routines
859  */
860 
861 /*
862  * SCQ handlers
863  */
864 /* ARGSUSED */
865 static void
866 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
867 {
868 	ibt_status_t	ibt_status;
869 	ibt_wc_t	wc;
870 	int		i;
871 
872 	/*
873 	 * Re-enable cq notify here to avoid missing any
874 	 * completion queue notification.
875 	 */
876 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
877 
878 	ibt_status = IBT_SUCCESS;
879 	while (ibt_status != IBT_CQ_EMPTY) {
880 	bzero(&wc, sizeof (wc));
881 	ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
882 	if (ibt_status != IBT_SUCCESS)
883 		return;
884 
885 	/*
886 	 * Got a send completion
887 	 */
888 	if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
889 		struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
890 		CONN	*conn = qptoc(wd->qp);
891 
892 		mutex_enter(&wd->sendwait_lock);
893 		switch (wc.wc_status) {
894 		case IBT_WC_SUCCESS:
895 			wd->status = RDMA_SUCCESS;
896 			break;
897 		case IBT_WC_WR_FLUSHED_ERR:
898 			wd->status = RDMA_FAILED;
899 			break;
900 		default:
901 /*
902  *    RC Send Q Error Code		Local state     Remote State
903  *    ==================== 		===========     ============
904  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
905  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
906  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
907  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
908  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
909  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
910  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
911  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
912  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
913  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
914  *    IBT_WC_WR_FLUSHED_ERR               None            None
915  */
916 			/*
917 			 * Channel in error state. Set connection to
918 			 * ERROR and cleanup will happen either from
919 			 * conn_release  or from rib_conn_get
920 			 */
921 			wd->status = RDMA_FAILED;
922 			mutex_enter(&conn->c_lock);
923 			if (conn->c_state != C_DISCONN_PEND)
924 				conn->c_state = C_ERROR_CONN;
925 			mutex_exit(&conn->c_lock);
926 			break;
927 		}
928 
929 		if (wd->cv_sig == 1) {
930 			/*
931 			 * Notify poster
932 			 */
933 			cv_signal(&wd->wait_cv);
934 			mutex_exit(&wd->sendwait_lock);
935 		} else {
936 			/*
937 			 * Poster not waiting for notification.
938 			 * Free the send buffers and send_wid
939 			 */
940 			for (i = 0; i < wd->nsbufs; i++) {
941 				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
942 				    (void *)(uintptr_t)wd->sbufaddr[i]);
943 				}
944 			mutex_exit(&wd->sendwait_lock);
945 			(void) rib_free_sendwait(wd);
946 			}
947 		}
948 	}
949 }
950 
951 /* ARGSUSED */
952 static void
953 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
954 {
955 	ibt_status_t	ibt_status;
956 	ibt_wc_t	wc;
957 	int		i;
958 
959 	/*
960 	 * Re-enable cq notify here to avoid missing any
961 	 * completion queue notification.
962 	 */
963 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
964 
965 	ibt_status = IBT_SUCCESS;
966 	while (ibt_status != IBT_CQ_EMPTY) {
967 		bzero(&wc, sizeof (wc));
968 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
969 		if (ibt_status != IBT_SUCCESS)
970 			return;
971 
972 		/*
973 		 * Got a send completion
974 		 */
975 		if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
976 			struct send_wid *wd =
977 			    (struct send_wid *)(uintptr_t)wc.wc_id;
978 			mutex_enter(&wd->sendwait_lock);
979 			if (wd->cv_sig == 1) {
980 				/*
981 				 * Update completion status and notify poster
982 				 */
983 				if (wc.wc_status == IBT_WC_SUCCESS)
984 					wd->status = RDMA_SUCCESS;
985 				else
986 					wd->status = RDMA_FAILED;
987 				cv_signal(&wd->wait_cv);
988 				mutex_exit(&wd->sendwait_lock);
989 			} else {
990 				/*
991 				 * Poster not waiting for notification.
992 				 * Free the send buffers and send_wid
993 				 */
994 				for (i = 0; i < wd->nsbufs; i++) {
995 					rib_rbuf_free(qptoc(wd->qp),
996 					    SEND_BUFFER,
997 					    (void *)(uintptr_t)wd->sbufaddr[i]);
998 				}
999 				mutex_exit(&wd->sendwait_lock);
1000 				(void) rib_free_sendwait(wd);
1001 			}
1002 		}
1003 	}
1004 }
1005 
1006 /*
1007  * RCQ handler
1008  */
1009 /* ARGSUSED */
1010 static void
1011 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1012 {
1013 	rib_qp_t	*qp;
1014 	ibt_status_t	ibt_status;
1015 	ibt_wc_t	wc;
1016 	struct recv_wid	*rwid;
1017 
1018 	/*
1019 	 * Re-enable cq notify here to avoid missing any
1020 	 * completion queue notification.
1021 	 */
1022 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1023 
1024 	ibt_status = IBT_SUCCESS;
1025 	while (ibt_status != IBT_CQ_EMPTY) {
1026 		bzero(&wc, sizeof (wc));
1027 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1028 		if (ibt_status != IBT_SUCCESS)
1029 			return;
1030 
1031 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1032 		qp = rwid->qp;
1033 		if (wc.wc_status == IBT_WC_SUCCESS) {
1034 			XDR	inxdrs, *xdrs;
1035 			uint_t	xid, vers, op, find_xid = 0;
1036 			struct reply	*r;
1037 			CONN *conn = qptoc(qp);
1038 			uint32_t rdma_credit = 0;
1039 
1040 			xdrs = &inxdrs;
1041 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1042 			    wc.wc_bytes_xfer, XDR_DECODE);
1043 			/*
1044 			 * Treat xid as opaque (xid is the first entity
1045 			 * in the rpc rdma message).
1046 			 */
1047 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1048 
1049 			/* Skip xid and set the xdr position accordingly. */
1050 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1051 			(void) xdr_u_int(xdrs, &vers);
1052 			(void) xdr_u_int(xdrs, &rdma_credit);
1053 			(void) xdr_u_int(xdrs, &op);
1054 			XDR_DESTROY(xdrs);
1055 
1056 			if (vers != RPCRDMA_VERS) {
1057 				/*
1058 				 * Invalid RPC/RDMA version. Cannot
1059 				 * interoperate.  Set connection to
1060 				 * ERROR state and bail out.
1061 				 */
1062 				mutex_enter(&conn->c_lock);
1063 				if (conn->c_state != C_DISCONN_PEND)
1064 					conn->c_state = C_ERROR_CONN;
1065 				mutex_exit(&conn->c_lock);
1066 				rib_rbuf_free(conn, RECV_BUFFER,
1067 				    (void *)(uintptr_t)rwid->addr);
1068 				rib_free_wid(rwid);
1069 				continue;
1070 			}
1071 
1072 			mutex_enter(&qp->replylist_lock);
1073 			for (r = qp->replylist; r != NULL; r = r->next) {
1074 				if (r->xid == xid) {
1075 					find_xid = 1;
1076 					switch (op) {
1077 					case RDMA_MSG:
1078 					case RDMA_NOMSG:
1079 					case RDMA_MSGP:
1080 						r->status = RDMA_SUCCESS;
1081 						r->vaddr_cq = rwid->addr;
1082 						r->bytes_xfer =
1083 						    wc.wc_bytes_xfer;
1084 						cv_signal(&r->wait_cv);
1085 						break;
1086 					default:
1087 						rib_rbuf_free(qptoc(qp),
1088 						    RECV_BUFFER,
1089 						    (void *)(uintptr_t)
1090 						    rwid->addr);
1091 						break;
1092 					}
1093 					break;
1094 				}
1095 			}
1096 			mutex_exit(&qp->replylist_lock);
1097 			if (find_xid == 0) {
1098 				/* RPC caller not waiting for reply */
1099 
1100 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1101 				    int, xid);
1102 
1103 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1104 				    (void *)(uintptr_t)rwid->addr);
1105 			}
1106 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1107 			CONN *conn = qptoc(qp);
1108 
1109 			/*
1110 			 * Connection being flushed. Just free
1111 			 * the posted buffer
1112 			 */
1113 			rib_rbuf_free(conn, RECV_BUFFER,
1114 			    (void *)(uintptr_t)rwid->addr);
1115 		} else {
1116 			CONN *conn = qptoc(qp);
1117 /*
1118  *  RC Recv Q Error Code		Local state     Remote State
1119  *  ====================		===========     ============
1120  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1121  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1122  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1123  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1124  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1125  *  IBT_WC_WR_FLUSHED_ERR               None            None
1126  */
1127 			/*
1128 			 * Channel in error state. Set connection
1129 			 * in ERROR state.
1130 			 */
1131 			mutex_enter(&conn->c_lock);
1132 			if (conn->c_state != C_DISCONN_PEND)
1133 				conn->c_state = C_ERROR_CONN;
1134 			mutex_exit(&conn->c_lock);
1135 			rib_rbuf_free(conn, RECV_BUFFER,
1136 			    (void *)(uintptr_t)rwid->addr);
1137 		}
1138 		rib_free_wid(rwid);
1139 	}
1140 }
1141 
1142 /* Server side */
1143 /* ARGSUSED */
1144 static void
1145 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1146 {
1147 	rdma_recv_data_t *rdp;
1148 	rib_qp_t	*qp;
1149 	ibt_status_t	ibt_status;
1150 	ibt_wc_t	wc;
1151 	struct svc_recv	*s_recvp;
1152 	CONN		*conn;
1153 	mblk_t		*mp;
1154 
1155 	/*
1156 	 * Re-enable cq notify here to avoid missing any
1157 	 * completion queue notification.
1158 	 */
1159 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1160 
1161 	ibt_status = IBT_SUCCESS;
1162 	while (ibt_status != IBT_CQ_EMPTY) {
1163 		bzero(&wc, sizeof (wc));
1164 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1165 		if (ibt_status != IBT_SUCCESS)
1166 			return;
1167 
1168 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1169 		qp = s_recvp->qp;
1170 		conn = qptoc(qp);
1171 		mutex_enter(&qp->posted_rbufs_lock);
1172 		qp->n_posted_rbufs--;
1173 #if defined(MEASURE_POOL_DEPTH)
1174 		rib_posted_rbufs(preposted_rbufs -  qp->n_posted_rbufs);
1175 #endif
1176 		if (qp->n_posted_rbufs == 0)
1177 			cv_signal(&qp->posted_rbufs_cv);
1178 		mutex_exit(&qp->posted_rbufs_lock);
1179 
1180 		if (wc.wc_status == IBT_WC_SUCCESS) {
1181 			XDR	inxdrs, *xdrs;
1182 			uint_t	xid, vers, op;
1183 			uint32_t rdma_credit;
1184 
1185 			xdrs = &inxdrs;
1186 			/* s_recvp->vaddr stores data */
1187 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1188 			    wc.wc_bytes_xfer, XDR_DECODE);
1189 
1190 			/*
1191 			 * Treat xid as opaque (xid is the first entity
1192 			 * in the rpc rdma message).
1193 			 */
1194 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1195 			/* Skip xid and set the xdr position accordingly. */
1196 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1197 			if (!xdr_u_int(xdrs, &vers) ||
1198 			    !xdr_u_int(xdrs, &rdma_credit) ||
1199 			    !xdr_u_int(xdrs, &op)) {
1200 				rib_rbuf_free(conn, RECV_BUFFER,
1201 				    (void *)(uintptr_t)s_recvp->vaddr);
1202 				XDR_DESTROY(xdrs);
1203 				(void) rib_free_svc_recv(s_recvp);
1204 				continue;
1205 			}
1206 			XDR_DESTROY(xdrs);
1207 
1208 			if (vers != RPCRDMA_VERS) {
1209 				/*
1210 				 * Invalid RPC/RDMA version.
1211 				 * Drop rpc rdma message.
1212 				 */
1213 				rib_rbuf_free(conn, RECV_BUFFER,
1214 				    (void *)(uintptr_t)s_recvp->vaddr);
1215 				(void) rib_free_svc_recv(s_recvp);
1216 				continue;
1217 			}
1218 			/*
1219 			 * Is this for RDMA_DONE?
1220 			 */
1221 			if (op == RDMA_DONE) {
1222 				rib_rbuf_free(conn, RECV_BUFFER,
1223 				    (void *)(uintptr_t)s_recvp->vaddr);
1224 				/*
1225 				 * Wake up the thread waiting on
1226 				 * a RDMA_DONE for xid
1227 				 */
1228 				mutex_enter(&qp->rdlist_lock);
1229 				rdma_done_notify(qp, xid);
1230 				mutex_exit(&qp->rdlist_lock);
1231 				(void) rib_free_svc_recv(s_recvp);
1232 				continue;
1233 			}
1234 
1235 			mutex_enter(&plugin_state_lock);
1236 			if (plugin_state == ACCEPT) {
1237 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1238 				    == NULL)
1239 					(void) strwaitbuf(
1240 					    sizeof (*rdp), BPRI_LO);
1241 				/*
1242 				 * Plugin is in accept state, hence the master
1243 				 * transport queue for this is still accepting
1244 				 * requests. Hence we can call svc_queuereq to
1245 				 * queue this recieved msg.
1246 				 */
1247 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1248 				rdp->conn = conn;
1249 				rdp->rpcmsg.addr =
1250 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1251 				rdp->rpcmsg.type = RECV_BUFFER;
1252 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1253 				rdp->status = wc.wc_status;
1254 				mutex_enter(&conn->c_lock);
1255 				conn->c_ref++;
1256 				mutex_exit(&conn->c_lock);
1257 				mp->b_wptr += sizeof (*rdp);
1258 				svc_queuereq((queue_t *)rib_stat->q, mp);
1259 				mutex_exit(&plugin_state_lock);
1260 			} else {
1261 				/*
1262 				 * The master transport for this is going
1263 				 * away and the queue is not accepting anymore
1264 				 * requests for krpc, so don't do anything, just
1265 				 * free the msg.
1266 				 */
1267 				mutex_exit(&plugin_state_lock);
1268 				rib_rbuf_free(conn, RECV_BUFFER,
1269 				    (void *)(uintptr_t)s_recvp->vaddr);
1270 			}
1271 		} else {
1272 			rib_rbuf_free(conn, RECV_BUFFER,
1273 			    (void *)(uintptr_t)s_recvp->vaddr);
1274 		}
1275 		(void) rib_free_svc_recv(s_recvp);
1276 	}
1277 }
1278 
1279 /*
1280  * Handles DR event of IBT_HCA_DETACH_EVENT.
1281  */
1282 /* ARGSUSED */
1283 static void
1284 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1285 	ibt_async_code_t code, ibt_async_event_t *event)
1286 {
1287 
1288 	switch (code) {
1289 	case IBT_HCA_ATTACH_EVENT:
1290 		/* ignore */
1291 		break;
1292 	case IBT_HCA_DETACH_EVENT:
1293 	{
1294 		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1295 		rib_detach_hca(rib_stat->hca);
1296 #ifdef DEBUG
1297 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1298 #endif
1299 		break;
1300 	}
1301 #ifdef DEBUG
1302 	case IBT_EVENT_PATH_MIGRATED:
1303 		cmn_err(CE_NOTE, "rib_async_handler(): "
1304 		    "IBT_EVENT_PATH_MIGRATED\n");
1305 		break;
1306 	case IBT_EVENT_SQD:
1307 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1308 		break;
1309 	case IBT_EVENT_COM_EST:
1310 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1311 		break;
1312 	case IBT_ERROR_CATASTROPHIC_CHAN:
1313 		cmn_err(CE_NOTE, "rib_async_handler(): "
1314 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1315 		break;
1316 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1317 		cmn_err(CE_NOTE, "rib_async_handler(): "
1318 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1319 		break;
1320 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1321 		cmn_err(CE_NOTE, "rib_async_handler(): "
1322 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1323 		break;
1324 	case IBT_ERROR_PATH_MIGRATE_REQ:
1325 		cmn_err(CE_NOTE, "rib_async_handler(): "
1326 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1327 		break;
1328 	case IBT_ERROR_CQ:
1329 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1330 		break;
1331 	case IBT_ERROR_PORT_DOWN:
1332 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1333 		break;
1334 	case IBT_EVENT_PORT_UP:
1335 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1336 		break;
1337 	case IBT_ASYNC_OPAQUE1:
1338 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1339 		break;
1340 	case IBT_ASYNC_OPAQUE2:
1341 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1342 		break;
1343 	case IBT_ASYNC_OPAQUE3:
1344 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1345 		break;
1346 	case IBT_ASYNC_OPAQUE4:
1347 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1348 		break;
1349 #endif
1350 	default:
1351 		break;
1352 	}
1353 }
1354 
1355 /*
1356  * Client's reachable function.
1357  */
1358 static rdma_stat
1359 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1360 {
1361 	rib_hca_t	*hca;
1362 	rdma_stat	status;
1363 
1364 	/*
1365 	 * First check if a hca is still attached
1366 	 */
1367 	*handle = NULL;
1368 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
1369 	if (rib_stat->hca->state != HCA_INITED) {
1370 		rw_exit(&rib_stat->hca->state_lock);
1371 		return (RDMA_FAILED);
1372 	}
1373 	status = rib_ping_srv(addr_type, raddr, &hca);
1374 	rw_exit(&rib_stat->hca->state_lock);
1375 
1376 	if (status == RDMA_SUCCESS) {
1377 		*handle = (void *)hca;
1378 		return (RDMA_SUCCESS);
1379 	} else {
1380 		*handle = NULL;
1381 		DTRACE_PROBE(rpcib__i__pingfailed);
1382 		return (RDMA_FAILED);
1383 	}
1384 }
1385 
1386 /* Client side qp creation */
1387 static rdma_stat
1388 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1389 {
1390 	rib_qp_t	*kqp = NULL;
1391 	CONN		*conn;
1392 	rdma_clnt_cred_ctrl_t *cc_info;
1393 
1394 	ASSERT(qp != NULL);
1395 	*qp = NULL;
1396 
1397 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1398 	conn = qptoc(kqp);
1399 	kqp->hca = hca;
1400 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1401 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1402 
1403 	kqp->mode = RIB_CLIENT;
1404 	kqp->chan_flags = IBT_BLOCKING;
1405 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1406 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1407 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1408 	/*
1409 	 * Initialize
1410 	 */
1411 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1412 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1413 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1414 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1415 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1416 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1417 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1418 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1419 	/*
1420 	 * Initialize the client credit control
1421 	 * portion of the rdmaconn struct.
1422 	 */
1423 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1424 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1425 	cc_info->clnt_cc_granted_ops = 0;
1426 	cc_info->clnt_cc_in_flight_ops = 0;
1427 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1428 
1429 	*qp = kqp;
1430 	return (RDMA_SUCCESS);
1431 }
1432 
1433 /* Server side qp creation */
1434 static rdma_stat
1435 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1436 {
1437 	rib_qp_t	*kqp = NULL;
1438 	ibt_chan_sizes_t	chan_sizes;
1439 	ibt_rc_chan_alloc_args_t	qp_attr;
1440 	ibt_status_t		ibt_status;
1441 	rdma_srv_cred_ctrl_t *cc_info;
1442 
1443 	*qp = NULL;
1444 
1445 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1446 	kqp->hca = hca;
1447 	kqp->port_num = port;
1448 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1449 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1450 
1451 	/*
1452 	 * Create the qp handle
1453 	 */
1454 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1455 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1456 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1457 	qp_attr.rc_pd = hca->pd_hdl;
1458 	qp_attr.rc_hca_port_num = port;
1459 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1460 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1461 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1462 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1463 	qp_attr.rc_clone_chan = NULL;
1464 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1465 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1466 
1467 	rw_enter(&hca->state_lock, RW_READER);
1468 	if (hca->state != HCA_DETACHED) {
1469 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1470 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1471 		    &chan_sizes);
1472 	} else {
1473 		rw_exit(&hca->state_lock);
1474 		goto fail;
1475 	}
1476 	rw_exit(&hca->state_lock);
1477 
1478 	if (ibt_status != IBT_SUCCESS) {
1479 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1480 		    int, ibt_status);
1481 		goto fail;
1482 	}
1483 
1484 	kqp->mode = RIB_SERVER;
1485 	kqp->chan_flags = IBT_BLOCKING;
1486 	kqp->q = q;	/* server ONLY */
1487 
1488 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1489 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1490 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1491 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1492 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1493 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1494 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1495 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1496 	/*
1497 	 * Set the private data area to qp to be used in callbacks
1498 	 */
1499 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1500 	kqp->rdmaconn.c_state = C_CONNECTED;
1501 
1502 	/*
1503 	 * Initialize the server credit control
1504 	 * portion of the rdmaconn struct.
1505 	 */
1506 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1507 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1508 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1509 	cc_info->srv_cc_cur_buffers_used = 0;
1510 	cc_info->srv_cc_posted = preposted_rbufs;
1511 
1512 	*qp = kqp;
1513 
1514 	return (RDMA_SUCCESS);
1515 fail:
1516 	if (kqp)
1517 		kmem_free(kqp, sizeof (rib_qp_t));
1518 
1519 	return (RDMA_FAILED);
1520 }
1521 
1522 /* ARGSUSED */
1523 ibt_cm_status_t
1524 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1525     ibt_cm_return_args_t *ret_args, void *priv_data,
1526     ibt_priv_data_len_t len)
1527 {
1528 	rpcib_state_t   *ribstat;
1529 	rib_hca_t	*hca;
1530 
1531 	ribstat = (rpcib_state_t *)clnt_hdl;
1532 	hca = (rib_hca_t *)ribstat->hca;
1533 
1534 	switch (event->cm_type) {
1535 
1536 	/* got a connection close event */
1537 	case IBT_CM_EVENT_CONN_CLOSED:
1538 	{
1539 		CONN	*conn;
1540 		rib_qp_t *qp;
1541 
1542 		/* check reason why connection was closed */
1543 		switch (event->cm_event.closed) {
1544 		case IBT_CM_CLOSED_DREP_RCVD:
1545 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1546 		case IBT_CM_CLOSED_DUP:
1547 		case IBT_CM_CLOSED_ABORT:
1548 		case IBT_CM_CLOSED_ALREADY:
1549 			/*
1550 			 * These cases indicate the local end initiated
1551 			 * the closing of the channel. Nothing to do here.
1552 			 */
1553 			break;
1554 		default:
1555 			/*
1556 			 * Reason for CONN_CLOSED event must be one of
1557 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1558 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1559 			 * the remote end is closing the channel. In these
1560 			 * cases free the channel and transition to error
1561 			 * state
1562 			 */
1563 			qp = ibt_get_chan_private(event->cm_channel);
1564 			conn = qptoc(qp);
1565 			mutex_enter(&conn->c_lock);
1566 			if (conn->c_state == C_DISCONN_PEND) {
1567 				mutex_exit(&conn->c_lock);
1568 				break;
1569 			}
1570 
1571 			conn->c_state = C_ERROR_CONN;
1572 
1573 			/*
1574 			 * Free the rc_channel. Channel has already
1575 			 * transitioned to ERROR state and WRs have been
1576 			 * FLUSHED_ERR already.
1577 			 */
1578 			(void) ibt_free_channel(qp->qp_hdl);
1579 			qp->qp_hdl = NULL;
1580 
1581 			/*
1582 			 * Free the conn if c_ref is down to 0 already
1583 			 */
1584 			if (conn->c_ref == 0) {
1585 				/*
1586 				 * Remove from list and free conn
1587 				 */
1588 				conn->c_state = C_DISCONN_PEND;
1589 				mutex_exit(&conn->c_lock);
1590 				(void) rib_disconnect_channel(conn,
1591 				    &hca->cl_conn_list);
1592 			} else {
1593 				mutex_exit(&conn->c_lock);
1594 			}
1595 #ifdef DEBUG
1596 			if (rib_debug)
1597 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1598 				    "(CONN_CLOSED) channel disconnected");
1599 #endif
1600 			break;
1601 		}
1602 		break;
1603 	}
1604 	default:
1605 		break;
1606 	}
1607 	return (IBT_CM_ACCEPT);
1608 }
1609 
1610 /* Check server ib address */
1611 rdma_stat
1612 rib_chk_srv_ibaddr(struct netbuf *raddr,
1613 	int addr_type, ibt_path_info_t *path, ibt_ip_addr_t *s_ip,
1614 	ibt_ip_addr_t *d_ip)
1615 {
1616 	struct sockaddr_in	*sin4;
1617 	struct sockaddr_in6	*sin6;
1618 	ibt_status_t		ibt_status;
1619 	ibt_ip_path_attr_t	ipattr;
1620 	uint8_t npaths = 0;
1621 	ibt_path_ip_src_t	srcip;
1622 
1623 	ASSERT(raddr->buf != NULL);
1624 
1625 	(void) bzero(path, sizeof (ibt_path_info_t));
1626 
1627 	switch (addr_type) {
1628 	case AF_INET:
1629 		sin4 = (struct sockaddr_in *)raddr->buf;
1630 		d_ip->family = AF_INET;
1631 		d_ip->un.ip4addr = sin4->sin_addr.s_addr;
1632 		break;
1633 
1634 	case AF_INET6:
1635 		sin6 = (struct sockaddr_in6 *)raddr->buf;
1636 		d_ip->family = AF_INET6;
1637 		d_ip->un.ip6addr = sin6->sin6_addr;
1638 		break;
1639 
1640 	default:
1641 		return (RDMA_INVAL);
1642 	}
1643 
1644 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1645 	bzero(&srcip, sizeof (ibt_path_ip_src_t));
1646 
1647 	ipattr.ipa_dst_ip 	= d_ip;
1648 	ipattr.ipa_hca_guid 	= rib_stat->hca->hca_guid;
1649 	ipattr.ipa_ndst		= 1;
1650 	ipattr.ipa_max_paths	= 1;
1651 	npaths = 0;
1652 
1653 	ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1654 	    IBT_PATH_NO_FLAGS,
1655 	    &ipattr,
1656 	    path,
1657 	    &npaths,
1658 	    &srcip);
1659 
1660 	if (ibt_status != IBT_SUCCESS ||
1661 	    npaths < 1 ||
1662 	    path->pi_hca_guid != rib_stat->hca->hca_guid) {
1663 
1664 		bzero(s_ip, sizeof (ibt_path_ip_src_t));
1665 		return (RDMA_FAILED);
1666 	}
1667 
1668 	if (srcip.ip_primary.family == AF_INET) {
1669 		s_ip->family = AF_INET;
1670 		s_ip->un.ip4addr = srcip.ip_primary.un.ip4addr;
1671 	} else {
1672 		s_ip->family = AF_INET6;
1673 		s_ip->un.ip6addr = srcip.ip_primary.un.ip6addr;
1674 	}
1675 
1676 	return (RDMA_SUCCESS);
1677 }
1678 
1679 
1680 /*
1681  * Connect to the server.
1682  */
1683 rdma_stat
1684 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path,
1685 		ibt_ip_addr_t *s_ip, ibt_ip_addr_t *d_ip)
1686 {
1687 	ibt_chan_open_args_t	chan_args;	/* channel args */
1688 	ibt_chan_sizes_t	chan_sizes;
1689 	ibt_rc_chan_alloc_args_t	qp_attr;
1690 	ibt_status_t		ibt_status;
1691 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1692 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1693 	ibt_ip_cm_info_t	ipcm_info;
1694 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1695 
1696 
1697 	(void) bzero(&chan_args, sizeof (chan_args));
1698 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1699 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1700 
1701 	switch (ipcm_info.src_addr.family = s_ip->family) {
1702 	case AF_INET:
1703 		ipcm_info.src_addr.un.ip4addr = s_ip->un.ip4addr;
1704 		break;
1705 	case AF_INET6:
1706 		ipcm_info.src_addr.un.ip6addr = s_ip->un.ip6addr;
1707 		break;
1708 	}
1709 
1710 	switch (ipcm_info.dst_addr.family = d_ip->family) {
1711 	case AF_INET:
1712 		ipcm_info.dst_addr.un.ip4addr = d_ip->un.ip4addr;
1713 		break;
1714 	case AF_INET6:
1715 		ipcm_info.dst_addr.un.ip6addr = d_ip->un.ip6addr;
1716 		break;
1717 	}
1718 
1719 	ipcm_info.src_port = NFS_RDMA_PORT;
1720 
1721 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1722 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1723 
1724 	if (ibt_status != IBT_SUCCESS) {
1725 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1726 		return (-1);
1727 	}
1728 
1729 	qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
1730 	/* Alloc a RC channel */
1731 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1732 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1733 	qp_attr.rc_pd = hca->pd_hdl;
1734 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1735 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1736 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1737 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1738 	qp_attr.rc_clone_chan = NULL;
1739 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1740 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1741 
1742 	path->pi_sid = ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT);
1743 	chan_args.oc_path = path;
1744 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1745 	chan_args.oc_cm_clnt_private = (void *)rib_stat;
1746 	chan_args.oc_rdma_ra_out = 4;
1747 	chan_args.oc_rdma_ra_in = 4;
1748 	chan_args.oc_path_retry_cnt = 2;
1749 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1750 	chan_args.oc_priv_data = cmp_ip_pvt;
1751 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1752 
1753 refresh:
1754 	rw_enter(&hca->state_lock, RW_READER);
1755 	if (hca->state != HCA_DETACHED) {
1756 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1757 		    IBT_ACHAN_NO_FLAGS,
1758 		    &qp_attr, &qp->qp_hdl,
1759 		    &chan_sizes);
1760 	} else {
1761 		rw_exit(&hca->state_lock);
1762 		return (RDMA_FAILED);
1763 	}
1764 	rw_exit(&hca->state_lock);
1765 
1766 	if (ibt_status != IBT_SUCCESS) {
1767 		DTRACE_PROBE1(rpcib__i_conntosrv,
1768 		    int, ibt_status);
1769 		return (RDMA_FAILED);
1770 	}
1771 
1772 	/* Connect to the Server */
1773 	(void) bzero(&ret_args, sizeof (ret_args));
1774 	mutex_enter(&qp->cb_lock);
1775 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1776 	    IBT_BLOCKING, &chan_args, &ret_args);
1777 	if (ibt_status != IBT_SUCCESS) {
1778 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1779 		    int, ibt_status, int, ret_args.rc_status);
1780 
1781 		(void) ibt_free_channel(qp->qp_hdl);
1782 		qp->qp_hdl = NULL;
1783 		mutex_exit(&qp->cb_lock);
1784 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1785 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1786 			/*
1787 			 * Got IBT_CM_CONN_STALE probably because of stale
1788 			 * data on the passive end of a channel that existed
1789 			 * prior to reboot. Retry establishing a channel
1790 			 * REFRESH_ATTEMPTS times, during which time the
1791 			 * stale conditions on the server might clear up.
1792 			 */
1793 			goto refresh;
1794 		}
1795 		return (RDMA_FAILED);
1796 	}
1797 	mutex_exit(&qp->cb_lock);
1798 	/*
1799 	 * Set the private data area to qp to be used in callbacks
1800 	 */
1801 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1802 	return (RDMA_SUCCESS);
1803 }
1804 
1805 rdma_stat
1806 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
1807 {
1808 	uint_t			i;
1809 	ibt_path_info_t		path;
1810 	ibt_status_t		ibt_status;
1811 	uint8_t			num_paths_p;
1812 	ibt_ip_path_attr_t	ipattr;
1813 	ibt_ip_addr_t		dstip;
1814 	ibt_path_ip_src_t	srcip;
1815 	rpcib_ipaddrs_t		addrs4;
1816 	rpcib_ipaddrs_t		addrs6;
1817 	struct sockaddr_in	*sinp;
1818 	struct sockaddr_in6	*sin6p;
1819 	rdma_stat		retval = RDMA_SUCCESS;
1820 
1821 	*hca = NULL;
1822 	ASSERT(raddr->buf != NULL);
1823 
1824 	bzero(&path, sizeof (ibt_path_info_t));
1825 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1826 	bzero(&srcip, sizeof (ibt_path_ip_src_t));
1827 
1828 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1829 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1830 		retval = RDMA_FAILED;
1831 		goto done;
1832 	}
1833 
1834 	/* Prep the destination address */
1835 	switch (addr_type) {
1836 	case AF_INET:
1837 		sinp = (struct sockaddr_in *)raddr->buf;
1838 		dstip.family = AF_INET;
1839 		dstip.un.ip4addr = sinp->sin_addr.s_addr;
1840 		sinp = addrs4.ri_list;
1841 
1842 		for (i = 0; i < addrs4.ri_count; i++) {
1843 			num_paths_p = 0;
1844 			ipattr.ipa_dst_ip 	= &dstip;
1845 			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1846 			ipattr.ipa_ndst		= 1;
1847 			ipattr.ipa_max_paths	= 1;
1848 			ipattr.ipa_src_ip.family = dstip.family;
1849 			ipattr.ipa_src_ip.un.ip4addr = sinp[i].sin_addr.s_addr;
1850 
1851 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1852 			    IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p,
1853 			    &srcip);
1854 			if (ibt_status == IBT_SUCCESS &&
1855 			    num_paths_p != 0 &&
1856 			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
1857 				*hca = rib_stat->hca;
1858 				goto done;
1859 			}
1860 		}
1861 		retval = RDMA_FAILED;
1862 		break;
1863 
1864 	case AF_INET6:
1865 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1866 		dstip.family = AF_INET6;
1867 		dstip.un.ip6addr = sin6p->sin6_addr;
1868 		sin6p = addrs6.ri_list;
1869 
1870 		for (i = 0; i < addrs6.ri_count; i++) {
1871 			num_paths_p = 0;
1872 			ipattr.ipa_dst_ip 	= &dstip;
1873 			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1874 			ipattr.ipa_ndst		= 1;
1875 			ipattr.ipa_max_paths	= 1;
1876 			ipattr.ipa_src_ip.family = dstip.family;
1877 			ipattr.ipa_src_ip.un.ip6addr = sin6p[i].sin6_addr;
1878 
1879 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1880 			    IBT_PATH_NO_FLAGS, &ipattr, &path, &num_paths_p,
1881 			    &srcip);
1882 			if (ibt_status == IBT_SUCCESS &&
1883 			    num_paths_p != 0 &&
1884 			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
1885 				*hca = rib_stat->hca;
1886 				goto done;
1887 			}
1888 		}
1889 		retval = RDMA_FAILED;
1890 		break;
1891 
1892 	default:
1893 		retval = RDMA_INVAL;
1894 		break;
1895 	}
1896 done:
1897 	if (addrs4.ri_size > 0)
1898 		kmem_free(addrs4.ri_list, addrs4.ri_size);
1899 	if (addrs6.ri_size > 0)
1900 		kmem_free(addrs6.ri_list, addrs6.ri_size);
1901 	return (retval);
1902 }
1903 
1904 /*
1905  * Close channel, remove from connection list and
1906  * free up resources allocated for that channel.
1907  */
1908 rdma_stat
1909 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
1910 {
1911 	rib_qp_t	*qp = ctoqp(conn);
1912 	rib_hca_t	*hca;
1913 
1914 	/*
1915 	 * c_ref == 0 and connection is in C_DISCONN_PEND
1916 	 */
1917 	hca = qp->hca;
1918 	if (conn_list != NULL)
1919 		(void) rib_rm_conn(conn, conn_list);
1920 
1921 	if (qp->qp_hdl != NULL) {
1922 		/*
1923 		 * If the channel has not been establised,
1924 		 * ibt_flush_channel is called to flush outstanding WRs
1925 		 * on the Qs.  Otherwise, ibt_close_rc_channel() is
1926 		 * called.  The channel is then freed.
1927 		 */
1928 		if (conn_list != NULL)
1929 			(void) ibt_close_rc_channel(qp->qp_hdl,
1930 			    IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
1931 		else
1932 			(void) ibt_flush_channel(qp->qp_hdl);
1933 
1934 		mutex_enter(&qp->posted_rbufs_lock);
1935 		while (qp->n_posted_rbufs)
1936 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
1937 		mutex_exit(&qp->posted_rbufs_lock);
1938 		(void) ibt_free_channel(qp->qp_hdl);
1939 		qp->qp_hdl = NULL;
1940 	}
1941 
1942 	ASSERT(qp->rdlist == NULL);
1943 
1944 	if (qp->replylist != NULL) {
1945 		(void) rib_rem_replylist(qp);
1946 	}
1947 
1948 	cv_destroy(&qp->cb_conn_cv);
1949 	cv_destroy(&qp->posted_rbufs_cv);
1950 	mutex_destroy(&qp->cb_lock);
1951 
1952 	mutex_destroy(&qp->replylist_lock);
1953 	mutex_destroy(&qp->posted_rbufs_lock);
1954 	mutex_destroy(&qp->rdlist_lock);
1955 
1956 	cv_destroy(&conn->c_cv);
1957 	mutex_destroy(&conn->c_lock);
1958 
1959 	if (conn->c_raddr.buf != NULL) {
1960 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
1961 	}
1962 	if (conn->c_laddr.buf != NULL) {
1963 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
1964 	}
1965 
1966 	/*
1967 	 * Credit control cleanup.
1968 	 */
1969 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
1970 		rdma_clnt_cred_ctrl_t *cc_info;
1971 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1972 		cv_destroy(&cc_info->clnt_cc_cv);
1973 	}
1974 
1975 	kmem_free(qp, sizeof (rib_qp_t));
1976 
1977 	/*
1978 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
1979 	 * then the hca is no longer being used.
1980 	 */
1981 	if (conn_list != NULL) {
1982 		rw_enter(&hca->state_lock, RW_READER);
1983 		if (hca->state == HCA_DETACHED) {
1984 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
1985 			if (hca->srv_conn_list.conn_hd == NULL) {
1986 				rw_enter(&hca->cl_conn_list.conn_lock,
1987 				    RW_READER);
1988 
1989 				if (hca->cl_conn_list.conn_hd == NULL) {
1990 					mutex_enter(&hca->inuse_lock);
1991 					hca->inuse = FALSE;
1992 					cv_signal(&hca->cb_cv);
1993 					mutex_exit(&hca->inuse_lock);
1994 				}
1995 				rw_exit(&hca->cl_conn_list.conn_lock);
1996 			}
1997 			rw_exit(&hca->srv_conn_list.conn_lock);
1998 		}
1999 		rw_exit(&hca->state_lock);
2000 	}
2001 
2002 	return (RDMA_SUCCESS);
2003 }
2004 
2005 /*
2006  * Wait for send completion notification. Only on receiving a
2007  * notification be it a successful or error completion, free the
2008  * send_wid.
2009  */
2010 static rdma_stat
2011 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2012 {
2013 	clock_t timout, cv_wait_ret;
2014 	rdma_stat error = RDMA_SUCCESS;
2015 	int	i;
2016 
2017 	/*
2018 	 * Wait for send to complete
2019 	 */
2020 	ASSERT(wd != NULL);
2021 	mutex_enter(&wd->sendwait_lock);
2022 	if (wd->status == (uint_t)SEND_WAIT) {
2023 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2024 		    ddi_get_lbolt();
2025 
2026 		if (qp->mode == RIB_SERVER) {
2027 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2028 			    &wd->sendwait_lock, timout)) > 0 &&
2029 			    wd->status == (uint_t)SEND_WAIT)
2030 				;
2031 			switch (cv_wait_ret) {
2032 			case -1:	/* timeout */
2033 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2034 
2035 				wd->cv_sig = 0;		/* no signal needed */
2036 				error = RDMA_TIMEDOUT;
2037 				break;
2038 			default:	/* got send completion */
2039 				break;
2040 			}
2041 		} else {
2042 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2043 			    &wd->sendwait_lock, timout)) > 0 &&
2044 			    wd->status == (uint_t)SEND_WAIT)
2045 				;
2046 			switch (cv_wait_ret) {
2047 			case -1:	/* timeout */
2048 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2049 
2050 				wd->cv_sig = 0;		/* no signal needed */
2051 				error = RDMA_TIMEDOUT;
2052 				break;
2053 			case 0:		/* interrupted */
2054 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2055 
2056 				wd->cv_sig = 0;		/* no signal needed */
2057 				error = RDMA_INTR;
2058 				break;
2059 			default:	/* got send completion */
2060 				break;
2061 			}
2062 		}
2063 	}
2064 
2065 	if (wd->status != (uint_t)SEND_WAIT) {
2066 		/* got send completion */
2067 		if (wd->status != RDMA_SUCCESS) {
2068 			error = wd->status;
2069 		if (wd->status != RDMA_CONNLOST)
2070 			error = RDMA_FAILED;
2071 		}
2072 		for (i = 0; i < wd->nsbufs; i++) {
2073 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2074 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2075 		}
2076 		mutex_exit(&wd->sendwait_lock);
2077 		(void) rib_free_sendwait(wd);
2078 	} else {
2079 		mutex_exit(&wd->sendwait_lock);
2080 	}
2081 	return (error);
2082 }
2083 
2084 static struct send_wid *
2085 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2086 {
2087 	struct send_wid	*wd;
2088 
2089 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2090 	wd->xid = xid;
2091 	wd->cv_sig = cv_sig;
2092 	wd->qp = qp;
2093 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2094 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2095 	wd->status = (uint_t)SEND_WAIT;
2096 
2097 	return (wd);
2098 }
2099 
2100 static int
2101 rib_free_sendwait(struct send_wid *wdesc)
2102 {
2103 	cv_destroy(&wdesc->wait_cv);
2104 	mutex_destroy(&wdesc->sendwait_lock);
2105 	kmem_free(wdesc, sizeof (*wdesc));
2106 
2107 	return (0);
2108 }
2109 
2110 static rdma_stat
2111 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2112 {
2113 	mutex_enter(&qp->replylist_lock);
2114 	if (rep != NULL) {
2115 		(void) rib_remreply(qp, rep);
2116 		mutex_exit(&qp->replylist_lock);
2117 		return (RDMA_SUCCESS);
2118 	}
2119 	mutex_exit(&qp->replylist_lock);
2120 	return (RDMA_FAILED);
2121 }
2122 
2123 /*
2124  * Send buffers are freed here only in case of error in posting
2125  * on QP. If the post succeeded, the send buffers are freed upon
2126  * send completion in rib_sendwait() or in the scq_handler.
2127  */
2128 rdma_stat
2129 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2130 	int send_sig, int cv_sig, caddr_t *swid)
2131 {
2132 	struct send_wid	*wdesc;
2133 	struct clist	*clp;
2134 	ibt_status_t	ibt_status = IBT_SUCCESS;
2135 	rdma_stat	ret = RDMA_SUCCESS;
2136 	ibt_send_wr_t	tx_wr;
2137 	int		i, nds;
2138 	ibt_wr_ds_t	sgl[DSEG_MAX];
2139 	uint_t		total_msg_size;
2140 	rib_qp_t	*qp;
2141 
2142 	qp = ctoqp(conn);
2143 
2144 	ASSERT(cl != NULL);
2145 
2146 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2147 
2148 	nds = 0;
2149 	total_msg_size = 0;
2150 	clp = cl;
2151 	while (clp != NULL) {
2152 		if (nds >= DSEG_MAX) {
2153 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2154 			return (RDMA_FAILED);
2155 		}
2156 		sgl[nds].ds_va = clp->w.c_saddr;
2157 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2158 		sgl[nds].ds_len = clp->c_len;
2159 		total_msg_size += clp->c_len;
2160 		clp = clp->c_next;
2161 		nds++;
2162 	}
2163 
2164 	if (send_sig) {
2165 		/* Set SEND_SIGNAL flag. */
2166 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2167 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2168 		*swid = (caddr_t)wdesc;
2169 	} else {
2170 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2171 		wdesc = rib_init_sendwait(msgid, 0, qp);
2172 		*swid = (caddr_t)wdesc;
2173 	}
2174 	wdesc->nsbufs = nds;
2175 	for (i = 0; i < nds; i++) {
2176 		wdesc->sbufaddr[i] = sgl[i].ds_va;
2177 	}
2178 
2179 	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2180 	tx_wr.wr_opcode = IBT_WRC_SEND;
2181 	tx_wr.wr_trans = IBT_RC_SRV;
2182 	tx_wr.wr_nds = nds;
2183 	tx_wr.wr_sgl = sgl;
2184 
2185 	mutex_enter(&conn->c_lock);
2186 	if (conn->c_state == C_CONNECTED) {
2187 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2188 	}
2189 	if (conn->c_state != C_CONNECTED ||
2190 	    ibt_status != IBT_SUCCESS) {
2191 		if (conn->c_state != C_DISCONN_PEND)
2192 			conn->c_state = C_ERROR_CONN;
2193 		mutex_exit(&conn->c_lock);
2194 		for (i = 0; i < nds; i++) {
2195 			rib_rbuf_free(conn, SEND_BUFFER,
2196 			    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2197 		}
2198 
2199 		(void) rib_free_sendwait(wdesc);
2200 
2201 		return (RDMA_CONNLOST);
2202 	}
2203 	mutex_exit(&conn->c_lock);
2204 
2205 	if (send_sig) {
2206 		if (cv_sig) {
2207 			/*
2208 			 * cv_wait for send to complete.
2209 			 * We can fail due to a timeout or signal or
2210 			 * unsuccessful send.
2211 			 */
2212 			ret = rib_sendwait(qp, wdesc);
2213 
2214 			return (ret);
2215 		}
2216 	}
2217 
2218 	return (RDMA_SUCCESS);
2219 }
2220 
2221 
2222 rdma_stat
2223 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2224 {
2225 	rdma_stat	ret;
2226 	caddr_t		wd;
2227 
2228 	/* send-wait & cv_signal */
2229 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2230 	return (ret);
2231 }
2232 
2233 /*
2234  * Server interface (svc_rdma_ksend).
2235  * Send RPC reply and wait for RDMA_DONE.
2236  */
2237 rdma_stat
2238 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2239 {
2240 	rdma_stat ret = RDMA_SUCCESS;
2241 	struct rdma_done_list *rd;
2242 	clock_t timout, cv_wait_ret;
2243 	caddr_t *wid = NULL;
2244 	rib_qp_t *qp = ctoqp(conn);
2245 
2246 	mutex_enter(&qp->rdlist_lock);
2247 	rd = rdma_done_add(qp, msgid);
2248 
2249 	/* No cv_signal (whether send-wait or no-send-wait) */
2250 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2251 
2252 	if (ret != RDMA_SUCCESS) {
2253 		rdma_done_rm(qp, rd);
2254 	} else {
2255 		/*
2256 		 * Wait for RDMA_DONE from remote end
2257 		 */
2258 		timout =
2259 		    drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2260 		cv_wait_ret = cv_timedwait(&rd->rdma_done_cv,
2261 		    &qp->rdlist_lock,
2262 		    timout);
2263 
2264 		rdma_done_rm(qp, rd);
2265 
2266 		if (cv_wait_ret < 0) {
2267 			ret = RDMA_TIMEDOUT;
2268 		}
2269 	}
2270 
2271 	mutex_exit(&qp->rdlist_lock);
2272 	return (ret);
2273 }
2274 
2275 static struct recv_wid *
2276 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2277 {
2278 	struct recv_wid	*rwid;
2279 
2280 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2281 	rwid->xid = msgid;
2282 	rwid->addr = sgl->ds_va;
2283 	rwid->qp = qp;
2284 
2285 	return (rwid);
2286 }
2287 
2288 static void
2289 rib_free_wid(struct recv_wid *rwid)
2290 {
2291 	kmem_free(rwid, sizeof (struct recv_wid));
2292 }
2293 
2294 rdma_stat
2295 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2296 {
2297 	rib_qp_t	*qp = ctoqp(conn);
2298 	struct clist	*clp = cl;
2299 	struct reply	*rep;
2300 	struct recv_wid	*rwid;
2301 	int		nds;
2302 	ibt_wr_ds_t	sgl[DSEG_MAX];
2303 	ibt_recv_wr_t	recv_wr;
2304 	rdma_stat	ret;
2305 	ibt_status_t	ibt_status;
2306 
2307 	/*
2308 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2309 	 */
2310 
2311 	nds = 0;
2312 	while (cl != NULL) {
2313 		if (nds >= DSEG_MAX) {
2314 			ret = RDMA_FAILED;
2315 			goto done;
2316 		}
2317 		sgl[nds].ds_va = cl->w.c_saddr;
2318 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2319 		sgl[nds].ds_len = cl->c_len;
2320 		cl = cl->c_next;
2321 		nds++;
2322 	}
2323 
2324 	if (nds != 1) {
2325 		ret = RDMA_FAILED;
2326 		goto done;
2327 	}
2328 
2329 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2330 	recv_wr.wr_nds = nds;
2331 	recv_wr.wr_sgl = sgl;
2332 
2333 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2334 	if (rwid) {
2335 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2336 	} else {
2337 		ret = RDMA_NORESOURCE;
2338 		goto done;
2339 	}
2340 	rep = rib_addreplylist(qp, msgid);
2341 	if (!rep) {
2342 		rib_free_wid(rwid);
2343 		ret = RDMA_NORESOURCE;
2344 		goto done;
2345 	}
2346 
2347 	mutex_enter(&conn->c_lock);
2348 
2349 	if (conn->c_state == C_CONNECTED) {
2350 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2351 	}
2352 
2353 	if (conn->c_state != C_CONNECTED ||
2354 	    ibt_status != IBT_SUCCESS) {
2355 		if (conn->c_state != C_DISCONN_PEND)
2356 			conn->c_state = C_ERROR_CONN;
2357 		mutex_exit(&conn->c_lock);
2358 		rib_free_wid(rwid);
2359 		(void) rib_rem_rep(qp, rep);
2360 		ret = RDMA_CONNLOST;
2361 		goto done;
2362 	}
2363 	mutex_exit(&conn->c_lock);
2364 	return (RDMA_SUCCESS);
2365 
2366 done:
2367 	while (clp != NULL) {
2368 		rib_rbuf_free(conn, RECV_BUFFER,
2369 		    (void *)(uintptr_t)clp->w.c_saddr3);
2370 		clp = clp->c_next;
2371 	}
2372 	return (ret);
2373 }
2374 
2375 rdma_stat
2376 rib_svc_post(CONN* conn, struct clist *cl)
2377 {
2378 	rib_qp_t	*qp = ctoqp(conn);
2379 	struct svc_recv	*s_recvp;
2380 	int		nds;
2381 	ibt_wr_ds_t	sgl[DSEG_MAX];
2382 	ibt_recv_wr_t	recv_wr;
2383 	ibt_status_t	ibt_status;
2384 
2385 	nds = 0;
2386 	while (cl != NULL) {
2387 		if (nds >= DSEG_MAX) {
2388 			return (RDMA_FAILED);
2389 		}
2390 		sgl[nds].ds_va = cl->w.c_saddr;
2391 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2392 		sgl[nds].ds_len = cl->c_len;
2393 		cl = cl->c_next;
2394 		nds++;
2395 	}
2396 
2397 	if (nds != 1) {
2398 		rib_rbuf_free(conn, RECV_BUFFER,
2399 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2400 
2401 		return (RDMA_FAILED);
2402 	}
2403 
2404 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2405 	recv_wr.wr_nds = nds;
2406 	recv_wr.wr_sgl = sgl;
2407 
2408 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2409 	/* Use s_recvp's addr as wr id */
2410 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2411 	mutex_enter(&conn->c_lock);
2412 	if (conn->c_state == C_CONNECTED) {
2413 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2414 	}
2415 	if (conn->c_state != C_CONNECTED ||
2416 	    ibt_status != IBT_SUCCESS) {
2417 		if (conn->c_state != C_DISCONN_PEND)
2418 			conn->c_state = C_ERROR_CONN;
2419 		mutex_exit(&conn->c_lock);
2420 		rib_rbuf_free(conn, RECV_BUFFER,
2421 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2422 		(void) rib_free_svc_recv(s_recvp);
2423 
2424 		return (RDMA_CONNLOST);
2425 	}
2426 	mutex_exit(&conn->c_lock);
2427 
2428 	return (RDMA_SUCCESS);
2429 }
2430 
2431 /* Client */
2432 rdma_stat
2433 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2434 {
2435 
2436 	return (rib_clnt_post(conn, cl, msgid));
2437 }
2438 
2439 /* Client */
2440 rdma_stat
2441 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2442 {
2443 	rib_qp_t	*qp = ctoqp(conn);
2444 	struct reply	*rep;
2445 
2446 	mutex_enter(&qp->replylist_lock);
2447 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2448 		if (rep->xid == msgid) {
2449 			if (rep->vaddr_cq) {
2450 				rib_rbuf_free(conn, RECV_BUFFER,
2451 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2452 			}
2453 			(void) rib_remreply(qp, rep);
2454 			break;
2455 		}
2456 	}
2457 	mutex_exit(&qp->replylist_lock);
2458 
2459 	return (RDMA_SUCCESS);
2460 }
2461 
2462 /* Server */
2463 rdma_stat
2464 rib_post_recv(CONN *conn, struct clist *cl)
2465 {
2466 	rib_qp_t	*qp = ctoqp(conn);
2467 
2468 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2469 		mutex_enter(&qp->posted_rbufs_lock);
2470 		qp->n_posted_rbufs++;
2471 		mutex_exit(&qp->posted_rbufs_lock);
2472 		return (RDMA_SUCCESS);
2473 	}
2474 	return (RDMA_FAILED);
2475 }
2476 
2477 /*
2478  * Client side only interface to "recv" the rpc reply buf
2479  * posted earlier by rib_post_resp(conn, cl, msgid).
2480  */
2481 rdma_stat
2482 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2483 {
2484 	struct reply *rep = NULL;
2485 	clock_t timout, cv_wait_ret;
2486 	rdma_stat ret = RDMA_SUCCESS;
2487 	rib_qp_t *qp = ctoqp(conn);
2488 
2489 	/*
2490 	 * Find the reply structure for this msgid
2491 	 */
2492 	mutex_enter(&qp->replylist_lock);
2493 
2494 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2495 		if (rep->xid == msgid)
2496 			break;
2497 	}
2498 
2499 	if (rep != NULL) {
2500 		/*
2501 		 * If message not yet received, wait.
2502 		 */
2503 		if (rep->status == (uint_t)REPLY_WAIT) {
2504 			timout = ddi_get_lbolt() +
2505 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2506 
2507 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2508 			    &qp->replylist_lock, timout)) > 0 &&
2509 			    rep->status == (uint_t)REPLY_WAIT)
2510 				;
2511 
2512 			switch (cv_wait_ret) {
2513 			case -1:	/* timeout */
2514 				ret = RDMA_TIMEDOUT;
2515 				break;
2516 			case 0:
2517 				ret = RDMA_INTR;
2518 				break;
2519 			default:
2520 				break;
2521 			}
2522 		}
2523 
2524 		if (rep->status == RDMA_SUCCESS) {
2525 			struct clist *cl = NULL;
2526 
2527 			/*
2528 			 * Got message successfully
2529 			 */
2530 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2531 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2532 			*clp = cl;
2533 		} else {
2534 			if (rep->status != (uint_t)REPLY_WAIT) {
2535 				/*
2536 				 * Got error in reply message. Free
2537 				 * recv buffer here.
2538 				 */
2539 				ret = rep->status;
2540 				rib_rbuf_free(conn, RECV_BUFFER,
2541 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2542 			}
2543 		}
2544 		(void) rib_remreply(qp, rep);
2545 	} else {
2546 		/*
2547 		 * No matching reply structure found for given msgid on the
2548 		 * reply wait list.
2549 		 */
2550 		ret = RDMA_INVAL;
2551 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2552 	}
2553 
2554 	/*
2555 	 * Done.
2556 	 */
2557 	mutex_exit(&qp->replylist_lock);
2558 	return (ret);
2559 }
2560 
2561 /*
2562  * RDMA write a buffer to the remote address.
2563  */
2564 rdma_stat
2565 rib_write(CONN *conn, struct clist *cl, int wait)
2566 {
2567 	ibt_send_wr_t	tx_wr;
2568 	int		cv_sig;
2569 	int		i;
2570 	ibt_wr_ds_t	sgl[DSEG_MAX];
2571 	struct send_wid	*wdesc;
2572 	ibt_status_t	ibt_status;
2573 	rdma_stat	ret = RDMA_SUCCESS;
2574 	rib_qp_t	*qp = ctoqp(conn);
2575 	uint64_t	n_writes = 0;
2576 	bool_t		force_wait = FALSE;
2577 
2578 	if (cl == NULL) {
2579 		return (RDMA_FAILED);
2580 	}
2581 
2582 
2583 	while ((cl != NULL)) {
2584 		if (cl->c_len > 0) {
2585 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2586 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2587 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2588 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2589 			sgl[0].ds_va = cl->w.c_saddr;
2590 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2591 			sgl[0].ds_len = cl->c_len;
2592 
2593 			if (wait) {
2594 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2595 				cv_sig = 1;
2596 			} else {
2597 				if (n_writes > max_unsignaled_rws) {
2598 					n_writes = 0;
2599 					force_wait = TRUE;
2600 					tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2601 					cv_sig = 1;
2602 				} else {
2603 					tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2604 					cv_sig = 0;
2605 				}
2606 			}
2607 
2608 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2609 			tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2610 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2611 			tx_wr.wr_trans = IBT_RC_SRV;
2612 			tx_wr.wr_nds = 1;
2613 			tx_wr.wr_sgl = sgl;
2614 
2615 			mutex_enter(&conn->c_lock);
2616 			if (conn->c_state == C_CONNECTED) {
2617 				ibt_status =
2618 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2619 			}
2620 			if (conn->c_state != C_CONNECTED ||
2621 			    ibt_status != IBT_SUCCESS) {
2622 				if (conn->c_state != C_DISCONN_PEND)
2623 					conn->c_state = C_ERROR_CONN;
2624 				mutex_exit(&conn->c_lock);
2625 				(void) rib_free_sendwait(wdesc);
2626 				return (RDMA_CONNLOST);
2627 			}
2628 			mutex_exit(&conn->c_lock);
2629 
2630 			/*
2631 			 * Wait for send to complete
2632 			 */
2633 			if (wait || force_wait) {
2634 				force_wait = FALSE;
2635 				ret = rib_sendwait(qp, wdesc);
2636 				if (ret != 0) {
2637 					return (ret);
2638 				}
2639 			} else {
2640 				mutex_enter(&wdesc->sendwait_lock);
2641 				for (i = 0; i < wdesc->nsbufs; i++) {
2642 					rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2643 					    (void *)(uintptr_t)
2644 					    wdesc->sbufaddr[i]);
2645 				}
2646 				mutex_exit(&wdesc->sendwait_lock);
2647 				(void) rib_free_sendwait(wdesc);
2648 			}
2649 			n_writes ++;
2650 		}
2651 		cl = cl->c_next;
2652 	}
2653 	return (RDMA_SUCCESS);
2654 }
2655 
2656 /*
2657  * RDMA Read a buffer from the remote address.
2658  */
2659 rdma_stat
2660 rib_read(CONN *conn, struct clist *cl, int wait)
2661 {
2662 	ibt_send_wr_t	rx_wr;
2663 	int		cv_sig;
2664 	int		i;
2665 	ibt_wr_ds_t	sgl;
2666 	struct send_wid	*wdesc;
2667 	ibt_status_t	ibt_status = IBT_SUCCESS;
2668 	rdma_stat	ret = RDMA_SUCCESS;
2669 	rib_qp_t	*qp = ctoqp(conn);
2670 
2671 	if (cl == NULL) {
2672 		return (RDMA_FAILED);
2673 	}
2674 
2675 	while (cl != NULL) {
2676 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2677 		/*
2678 		 * Remote address is at the head chunk item in list.
2679 		 */
2680 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2681 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2682 
2683 		sgl.ds_va = cl->u.c_daddr;
2684 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2685 		sgl.ds_len = cl->c_len;
2686 
2687 		if (wait) {
2688 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2689 			cv_sig = 1;
2690 		} else {
2691 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2692 			cv_sig = 0;
2693 		}
2694 
2695 		wdesc = rib_init_sendwait(0, cv_sig, qp);
2696 		rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2697 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2698 		rx_wr.wr_trans = IBT_RC_SRV;
2699 		rx_wr.wr_nds = 1;
2700 		rx_wr.wr_sgl = &sgl;
2701 
2702 		mutex_enter(&conn->c_lock);
2703 		if (conn->c_state == C_CONNECTED) {
2704 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2705 		}
2706 		if (conn->c_state != C_CONNECTED ||
2707 		    ibt_status != IBT_SUCCESS) {
2708 			if (conn->c_state != C_DISCONN_PEND)
2709 				conn->c_state = C_ERROR_CONN;
2710 			mutex_exit(&conn->c_lock);
2711 			(void) rib_free_sendwait(wdesc);
2712 			return (RDMA_CONNLOST);
2713 		}
2714 		mutex_exit(&conn->c_lock);
2715 
2716 		/*
2717 		 * Wait for send to complete if this is the
2718 		 * last item in the list.
2719 		 */
2720 		if (wait && cl->c_next == NULL) {
2721 			ret = rib_sendwait(qp, wdesc);
2722 			if (ret != 0) {
2723 				return (ret);
2724 			}
2725 		} else {
2726 			mutex_enter(&wdesc->sendwait_lock);
2727 			for (i = 0; i < wdesc->nsbufs; i++) {
2728 				rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2729 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2730 			}
2731 			mutex_exit(&wdesc->sendwait_lock);
2732 			(void) rib_free_sendwait(wdesc);
2733 		}
2734 		cl = cl->c_next;
2735 	}
2736 	return (RDMA_SUCCESS);
2737 }
2738 
/*
 * rib_srv_cm_handler()
 *    Connection Manager callback to handle RC connection requests.
 *
 *    any is the rpcib_state_t passed as the handler's private argument
 *    and event is the CM event being delivered.  Handled events:
 *
 *	IBT_CM_EVENT_REQ_RCV	 create a server channel, pre-post
 *				 preposted_rbufs receive buffers, add the
 *				 connection to the HCA's srv_conn_list and
 *				 record the peer's IP address in c_raddr.
 *	IBT_CM_EVENT_CONN_CLOSED for remotely initiated closes, move the
 *				 connection to C_ERROR_CONN, free the
 *				 channel and, if c_ref is 0, disconnect.
 *	IBT_CM_EVENT_CONN_EST	 optional debug message only.
 *
 *    Returns IBT_CM_REJECT when a request cannot be set up,
 *    IBT_CM_DEFAULT for events not listed above, and IBT_CM_ACCEPT
 *    otherwise.  ret_args is filled in only for REQ_RCV.
 */
/* ARGSUSED */
static ibt_cm_status_t
rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
	ibt_cm_return_args_t *ret_args, void *priv_data,
	ibt_priv_data_len_t len)
{
	queue_t		*q;
	rib_qp_t	*qp;
	rpcib_state_t	*ribstat;
	rib_hca_t	*hca;
	rdma_stat	status = RDMA_SUCCESS;
	int		i;
	struct clist	cl;
	rdma_buf_t	rdbuf = {0};
	void		*buf = NULL;
	CONN		*conn;
	ibt_ip_cm_info_t	ipinfo;
	struct sockaddr_in *s;
	struct sockaddr_in6 *s6;
	int sin_size = sizeof (struct sockaddr_in);
	int in_size = sizeof (struct in_addr);
	int sin6_size = sizeof (struct sockaddr_in6);

	ASSERT(any != NULL);
	ASSERT(event != NULL);

	ribstat = (rpcib_state_t *)any;
	hca = (rib_hca_t *)ribstat->hca;
	ASSERT(hca != NULL);

	/* got a connection request */
	switch (event->cm_type) {
	case IBT_CM_EVENT_REQ_RCV:
		/*
		 * If the plugin is in the NO_ACCEPT state, bail out.
		 */
		mutex_enter(&plugin_state_lock);
		if (plugin_state == NO_ACCEPT) {
			mutex_exit(&plugin_state_lock);
			return (IBT_CM_REJECT);
		}
		mutex_exit(&plugin_state_lock);

		/*
		 * Need to send a MRA MAD to CM so that it does not
		 * timeout on us.
		 */
		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
		    event->cm_event.req.req_timeout * 8, NULL, 0);

		/* Snapshot the queue pointer under open_hca_lock. */
		mutex_enter(&rib_stat->open_hca_lock);
		q = rib_stat->q;
		mutex_exit(&rib_stat->open_hca_lock);

		status = rib_svc_create_chan(hca, (caddr_t)q,
		    event->cm_event.req.req_prim_hca_port, &qp);

		if (status) {
			return (IBT_CM_REJECT);
		}

		/* Reply parameters handed back to the CM. */
		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;

		/*
		 * Pre-posts RECV buffers
		 */
		conn = qptoc(qp);
		for (i = 0; i < preposted_rbufs; i++) {
			bzero(&rdbuf, sizeof (rdbuf));
			rdbuf.type = RECV_BUFFER;
			buf = rib_rbuf_alloc(conn, &rdbuf);
			if (buf == NULL) {
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}

			/* Describe the buffer as a one-entry chunk list. */
			bzero(&cl, sizeof (cl));
			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
			cl.c_len = rdbuf.len;
			cl.c_smemhandle.mrc_lmr =
			    rdbuf.handle.mrc_lmr; /* lkey */
			cl.c_next = NULL;
			status = rib_post_recv(conn, &cl);
			if (status != RDMA_SUCCESS) {
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}
		}
		(void) rib_add_connlist(conn, &hca->srv_conn_list);

		/*
		 * NOTE(review): the IBT_CM_REJECT returns below happen
		 * after conn was added to srv_conn_list and do not call
		 * rib_disconnect_channel() here -- confirm the connection
		 * is torn down elsewhere on these paths.
		 */

		/*
		 * Get the address translation
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_exit(&hca->state_lock);
			return (IBT_CM_REJECT);
		}
		rw_exit(&hca->state_lock);

		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));

		if (ibt_get_ip_data(event->cm_priv_data_len,
		    event->cm_priv_data,
		    &ipinfo) != IBT_SUCCESS) {

			return (IBT_CM_REJECT);
		}

		/* Record the peer address in conn->c_raddr. */
		switch (ipinfo.src_addr.family) {
		case AF_INET:

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin_size;
			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);

			s = (struct sockaddr_in *)conn->c_raddr.buf;
			s->sin_family = AF_INET;

			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
			    &s->sin_addr, in_size);

			break;

		case AF_INET6:

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin6_size;
			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);

			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
			s6->sin6_family = AF_INET6;
			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
			    &s6->sin6_addr,
			    sizeof (struct in6_addr));

			break;

		default:
			return (IBT_CM_REJECT);
		}

		break;

	case IBT_CM_EVENT_CONN_CLOSED:
	{
		/* These locals shadow the function-scope conn and qp. */
		CONN		*conn;
		rib_qp_t	*qp;

		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			/* A disconnect is already in progress: leave it. */
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}
			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the rc_channel. Channel has already
			 * transitioned to ERROR state and WRs have been
			 * FLUSHED_ERR already.
			 */
			(void) ibt_free_channel(qp->qp_hdl);
			qp->qp_hdl = NULL;

			/*
			 * Free the conn if c_ref goes down to 0
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				(void) rib_disconnect_channel(conn,
				    &hca->srv_conn_list);
			} else {
				mutex_exit(&conn->c_lock);
			}
			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
			break;
		}
		break;
	}
	case IBT_CM_EVENT_CONN_EST:
		/*
		 * RTU received, hence connection established.
		 */
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "(CONN_EST) channel established");
		break;

	default:
		if (rib_debug > 2) {
			/* Let CM handle the following events. */
			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
			}
		}
		return (IBT_CM_DEFAULT);
	}

	/* accept all other CM messages (i.e. let the CM handle them) */
	return (IBT_CM_ACCEPT);
}
2986 
2987 static rdma_stat
2988 rib_register_service(rib_hca_t *hca, int service_type)
2989 {
2990 	ibt_srv_desc_t		sdesc;
2991 	ibt_hca_portinfo_t	*port_infop;
2992 	ib_svc_id_t		srv_id;
2993 	ibt_srv_hdl_t		srv_hdl;
2994 	uint_t			port_size;
2995 	uint_t			pki, i, num_ports, nbinds;
2996 	ibt_status_t		ibt_status;
2997 	rib_service_t		*new_service;
2998 	ib_pkey_t		pkey;
2999 
3000 	/*
3001 	 * Query all ports for the given HCA
3002 	 */
3003 	rw_enter(&hca->state_lock, RW_READER);
3004 	if (hca->state != HCA_DETACHED) {
3005 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3006 		    &num_ports, &port_size);
3007 		rw_exit(&hca->state_lock);
3008 	} else {
3009 		rw_exit(&hca->state_lock);
3010 		return (RDMA_FAILED);
3011 	}
3012 	if (ibt_status != IBT_SUCCESS) {
3013 		return (RDMA_FAILED);
3014 	}
3015 
3016 	DTRACE_PROBE1(rpcib__i__regservice_numports,
3017 	    int, num_ports);
3018 
3019 	for (i = 0; i < num_ports; i++) {
3020 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3021 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3022 			    int, i+1);
3023 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3024 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
3025 			    int, i+1);
3026 		}
3027 	}
3028 
3029 	/*
3030 	 * Get all the IP addresses on this system to register the
3031 	 * given "service type" on all DNS recognized IP addrs.
3032 	 * Each service type such as NFS will have all the systems
3033 	 * IP addresses as its different names. For now the only
3034 	 * type of service we support in RPCIB is NFS.
3035 	 */
3036 	rw_enter(&hca->service_list_lock, RW_WRITER);
3037 	/*
3038 	 * Start registering and binding service to active
3039 	 * on active ports on this HCA.
3040 	 */
3041 	nbinds = 0;
3042 	new_service = NULL;
3043 
3044 	/*
3045 	 * We use IP addresses as the service names for
3046 	 * service registration.  Register each of them
3047 	 * with CM to obtain a svc_id and svc_hdl.  We do not
3048 	 * register the service with machine's loopback address.
3049 	 */
3050 	(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3051 	(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3052 	(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3053 
3054 	sdesc.sd_handler = rib_srv_cm_handler;
3055 	sdesc.sd_flags = 0;
3056 	ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3057 	    &sdesc, ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT),
3058 	    1, &srv_hdl, &srv_id);
3059 
3060 	for (i = 0; i < num_ports; i++) {
3061 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3062 			continue;
3063 
3064 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3065 			pkey = port_infop[i].p_pkey_tbl[pki];
3066 			if ((pkey & IBSRM_HB) &&
3067 			    (pkey != IB_PKEY_INVALID_FULL)) {
3068 
3069 				/*
3070 				 * Allocate and prepare a service entry
3071 				 */
3072 				new_service =
3073 				    kmem_zalloc(1 * sizeof (rib_service_t),
3074 				    KM_SLEEP);
3075 
3076 				new_service->srv_type = service_type;
3077 				new_service->srv_hdl = srv_hdl;
3078 				new_service->srv_next = NULL;
3079 
3080 				ibt_status = ibt_bind_service(srv_hdl,
3081 				    port_infop[i].p_sgid_tbl[0],
3082 				    NULL, rib_stat, NULL);
3083 
3084 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3085 				    int, ibt_status);
3086 
3087 				if (ibt_status != IBT_SUCCESS) {
3088 					kmem_free(new_service,
3089 					    sizeof (rib_service_t));
3090 					new_service = NULL;
3091 					continue;
3092 				}
3093 
3094 				/*
3095 				 * Add to the service list for this HCA
3096 				 */
3097 				new_service->srv_next = hca->service_list;
3098 				hca->service_list = new_service;
3099 				new_service = NULL;
3100 				nbinds++;
3101 			}
3102 		}
3103 	}
3104 	rw_exit(&hca->service_list_lock);
3105 
3106 	ibt_free_portinfo(port_infop, port_size);
3107 
3108 	if (nbinds == 0) {
3109 		return (RDMA_FAILED);
3110 	} else {
3111 		/*
3112 		 * Put this plugin into accept state, since atleast
3113 		 * one registration was successful.
3114 		 */
3115 		mutex_enter(&plugin_state_lock);
3116 		plugin_state = ACCEPT;
3117 		mutex_exit(&plugin_state_lock);
3118 		return (RDMA_SUCCESS);
3119 	}
3120 }
3121 
3122 void
3123 rib_listen(struct rdma_svc_data *rd)
3124 {
3125 	rdma_stat status = RDMA_SUCCESS;
3126 
3127 	rd->active = 0;
3128 	rd->err_code = RDMA_FAILED;
3129 
3130 	/*
3131 	 * First check if a hca is still attached
3132 	 */
3133 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3134 	if (rib_stat->hca->state != HCA_INITED) {
3135 		rw_exit(&rib_stat->hca->state_lock);
3136 		return;
3137 	}
3138 	rw_exit(&rib_stat->hca->state_lock);
3139 
3140 	rib_stat->q = &rd->q;
3141 	/*
3142 	 * Right now the only service type is NFS. Hence force feed this
3143 	 * value. Ideally to communicate the service type it should be
3144 	 * passed down in rdma_svc_data.
3145 	 */
3146 	rib_stat->service_type = NFS;
3147 	status = rib_register_service(rib_stat->hca, NFS);
3148 	if (status != RDMA_SUCCESS) {
3149 		rd->err_code = status;
3150 		return;
3151 	}
3152 	/*
3153 	 * Service active on an HCA, check rd->err_code for more
3154 	 * explainable errors.
3155 	 */
3156 	rd->active = 1;
3157 	rd->err_code = status;
3158 }
3159 
3160 /* XXXX */
3161 /* ARGSUSED */
3162 static void
3163 rib_listen_stop(struct rdma_svc_data *svcdata)
3164 {
3165 	rib_hca_t		*hca;
3166 
3167 	/*
3168 	 * KRPC called the RDMATF to stop the listeners, this means
3169 	 * stop sending incomming or recieved requests to KRPC master
3170 	 * transport handle for RDMA-IB. This is also means that the
3171 	 * master transport handle, responsible for us, is going away.
3172 	 */
3173 	mutex_enter(&plugin_state_lock);
3174 	plugin_state = NO_ACCEPT;
3175 	if (svcdata != NULL)
3176 		svcdata->active = 0;
3177 	mutex_exit(&plugin_state_lock);
3178 
3179 	/*
3180 	 * First check if a hca is still attached
3181 	 */
3182 	hca = rib_stat->hca;
3183 	rw_enter(&hca->state_lock, RW_READER);
3184 	if (hca->state != HCA_INITED) {
3185 		rw_exit(&hca->state_lock);
3186 		return;
3187 	}
3188 	rib_close_channels(&hca->srv_conn_list);
3189 	rib_stop_services(hca);
3190 	rw_exit(&hca->state_lock);
3191 }
3192 
3193 /*
3194  * Traverse the HCA's service list to unbind and deregister services.
3195  * Instead of unbinding the service for a service handle by
3196  * calling ibt_unbind_service() for each port/pkey, we unbind
3197  * all the services for the service handle by making only one
3198  * call to ibt_unbind_all_services().  Then, we deregister the
3199  * service for the service handle.
3200  *
3201  * When traversing the entries in service_list, we compare the
3202  * srv_hdl of the current entry with that of the next.  If they
3203  * are different or if the next entry is NULL, the current entry
3204  * marks the last binding of the service handle.  In this case,
3205  * call ibt_unbind_all_services() and deregister the service for
3206  * the service handle.  If they are the same, the current and the
3207  * next entries are bound to the same service handle.  In this
3208  * case, move on to the next entry.
3209  */
3210 static void
3211 rib_stop_services(rib_hca_t *hca)
3212 {
3213 	rib_service_t		*srv_list, *to_remove;
3214 
3215 	/*
3216 	 * unbind and deregister the services for this service type.
3217 	 * Right now there is only one service type. In future it will
3218 	 * be passed down to this function.
3219 	 */
3220 	rw_enter(&hca->service_list_lock, RW_WRITER);
3221 	srv_list = hca->service_list;
3222 	while (srv_list != NULL) {
3223 		to_remove = srv_list;
3224 		srv_list = to_remove->srv_next;
3225 		if (srv_list == NULL || bcmp(to_remove->srv_hdl,
3226 		    srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
3227 
3228 			(void) ibt_unbind_all_services(to_remove->srv_hdl);
3229 			(void) ibt_deregister_service(hca->ibt_clnt_hdl,
3230 			    to_remove->srv_hdl);
3231 		}
3232 
3233 		kmem_free(to_remove, sizeof (rib_service_t));
3234 	}
3235 	hca->service_list = NULL;
3236 	rw_exit(&hca->service_list_lock);
3237 }
3238 
3239 static struct svc_recv *
3240 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3241 {
3242 	struct svc_recv	*recvp;
3243 
3244 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3245 	recvp->vaddr = sgl->ds_va;
3246 	recvp->qp = qp;
3247 	recvp->bytes_xfer = 0;
3248 	return (recvp);
3249 }
3250 
3251 static int
3252 rib_free_svc_recv(struct svc_recv *recvp)
3253 {
3254 	kmem_free(recvp, sizeof (*recvp));
3255 
3256 	return (0);
3257 }
3258 
3259 static struct reply *
3260 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3261 {
3262 	struct reply	*rep;
3263 
3264 
3265 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3266 	if (rep == NULL) {
3267 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3268 		return (NULL);
3269 	}
3270 	rep->xid = msgid;
3271 	rep->vaddr_cq = NULL;
3272 	rep->bytes_xfer = 0;
3273 	rep->status = (uint_t)REPLY_WAIT;
3274 	rep->prev = NULL;
3275 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3276 
3277 	mutex_enter(&qp->replylist_lock);
3278 	if (qp->replylist) {
3279 		rep->next = qp->replylist;
3280 		qp->replylist->prev = rep;
3281 	}
3282 	qp->rep_list_size++;
3283 
3284 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3285 	    int, qp->rep_list_size);
3286 
3287 	qp->replylist = rep;
3288 	mutex_exit(&qp->replylist_lock);
3289 
3290 	return (rep);
3291 }
3292 
3293 static rdma_stat
3294 rib_rem_replylist(rib_qp_t *qp)
3295 {
3296 	struct reply	*r, *n;
3297 
3298 	mutex_enter(&qp->replylist_lock);
3299 	for (r = qp->replylist; r != NULL; r = n) {
3300 		n = r->next;
3301 		(void) rib_remreply(qp, r);
3302 	}
3303 	mutex_exit(&qp->replylist_lock);
3304 
3305 	return (RDMA_SUCCESS);
3306 }
3307 
3308 static int
3309 rib_remreply(rib_qp_t *qp, struct reply *rep)
3310 {
3311 
3312 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3313 	if (rep->prev) {
3314 		rep->prev->next = rep->next;
3315 	}
3316 	if (rep->next) {
3317 		rep->next->prev = rep->prev;
3318 	}
3319 	if (qp->replylist == rep)
3320 		qp->replylist = rep->next;
3321 
3322 	cv_destroy(&rep->wait_cv);
3323 	qp->rep_list_size--;
3324 
3325 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3326 	    int, qp->rep_list_size);
3327 
3328 	kmem_free(rep, sizeof (*rep));
3329 
3330 	return (0);
3331 }
3332 
3333 rdma_stat
3334 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3335 	struct mrc *buf_handle)
3336 {
3337 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3338 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3339 	rdma_stat	status;
3340 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3341 
3342 	/*
3343 	 * Note: ALL buffer pools use the same memory type RDMARW.
3344 	 */
3345 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3346 	if (status == RDMA_SUCCESS) {
3347 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3348 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3349 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3350 	} else {
3351 		buf_handle->mrc_linfo = NULL;
3352 		buf_handle->mrc_lmr = 0;
3353 		buf_handle->mrc_rmr = 0;
3354 	}
3355 	return (status);
3356 }
3357 
3358 static rdma_stat
3359 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3360 	ibt_mr_flags_t spec,
3361 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3362 {
3363 	ibt_mr_attr_t	mem_attr;
3364 	ibt_status_t	ibt_status;
3365 	mem_attr.mr_vaddr = (uintptr_t)buf;
3366 	mem_attr.mr_len = (ib_msglen_t)size;
3367 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3368 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3369 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3370 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3371 
3372 	rw_enter(&hca->state_lock, RW_READER);
3373 	if (hca->state == HCA_INITED) {
3374 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3375 		    &mem_attr, mr_hdlp, mr_descp);
3376 		rw_exit(&hca->state_lock);
3377 	} else {
3378 		rw_exit(&hca->state_lock);
3379 		return (RDMA_FAILED);
3380 	}
3381 
3382 	if (ibt_status != IBT_SUCCESS) {
3383 		return (RDMA_FAILED);
3384 	}
3385 	return (RDMA_SUCCESS);
3386 }
3387 
3388 rdma_stat
3389 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3390 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3391 {
3392 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3393 	rib_lrc_entry_t *l;
3394 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3395 	rdma_stat	status;
3396 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3397 
3398 	/*
3399 	 * Non-coherent memory registration.
3400 	 */
3401 	l = (rib_lrc_entry_t *)lrc;
3402 	if (l) {
3403 		if (l->registered) {
3404 			buf_handle->mrc_linfo =
3405 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3406 			buf_handle->mrc_lmr =
3407 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3408 			buf_handle->mrc_rmr =
3409 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3410 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3411 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3412 			return (RDMA_SUCCESS);
3413 		} else {
3414 			/* Always register the whole buffer */
3415 			buf = (caddr_t)l->lrc_buf;
3416 			buflen = l->lrc_len;
3417 		}
3418 	}
3419 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3420 
3421 	if (status == RDMA_SUCCESS) {
3422 		if (l) {
3423 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3424 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3425 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3426 			l->registered		 = TRUE;
3427 		}
3428 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3429 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3430 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3431 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3432 	} else {
3433 		buf_handle->mrc_linfo = NULL;
3434 		buf_handle->mrc_lmr = 0;
3435 		buf_handle->mrc_rmr = 0;
3436 	}
3437 	return (status);
3438 }
3439 
3440 /* ARGSUSED */
3441 rdma_stat
3442 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3443 {
3444 	rib_hca_t *hca = (ctoqp(conn))->hca;
3445 	/*
3446 	 * Allow memory deregistration even if HCA is
3447 	 * getting detached. Need all outstanding
3448 	 * memory registrations to be deregistered
3449 	 * before HCA_DETACH_EVENT can be accepted.
3450 	 */
3451 	(void) ibt_deregister_mr(hca->hca_hdl,
3452 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3453 	return (RDMA_SUCCESS);
3454 }
3455 
3456 /* ARGSUSED */
3457 rdma_stat
3458 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3459 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3460 {
3461 	rib_lrc_entry_t *l;
3462 	l = (rib_lrc_entry_t *)lrc;
3463 	if (l)
3464 		if (l->registered)
3465 			return (RDMA_SUCCESS);
3466 
3467 	(void) rib_deregistermem(conn, buf, buf_handle);
3468 
3469 	return (RDMA_SUCCESS);
3470 }
3471 
3472 /* ARGSUSED */
3473 rdma_stat
3474 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3475 		int len, int cpu)
3476 {
3477 	ibt_status_t	status;
3478 	rib_hca_t *hca = (ctoqp(conn))->hca;
3479 	ibt_mr_sync_t	mr_segment;
3480 
3481 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3482 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3483 	mr_segment.ms_len = (ib_memlen_t)len;
3484 	if (cpu) {
3485 		/* make incoming data visible to memory */
3486 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3487 	} else {
3488 		/* make memory changes visible to IO */
3489 		mr_segment.ms_flags = IBT_SYNC_READ;
3490 	}
3491 	rw_enter(&hca->state_lock, RW_READER);
3492 	if (hca->state == HCA_INITED) {
3493 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3494 		rw_exit(&hca->state_lock);
3495 	} else {
3496 		rw_exit(&hca->state_lock);
3497 		return (RDMA_FAILED);
3498 	}
3499 
3500 	if (status == IBT_SUCCESS)
3501 		return (RDMA_SUCCESS);
3502 	else {
3503 		return (RDMA_FAILED);
3504 	}
3505 }
3506 
3507 /*
3508  * XXXX	????
3509  */
3510 static rdma_stat
3511 rib_getinfo(rdma_info_t *info)
3512 {
3513 	/*
3514 	 * XXXX	Hack!
3515 	 */
3516 	info->addrlen = 16;
3517 	info->mts = 1000000;
3518 	info->mtu = 1000000;
3519 
3520 	return (RDMA_SUCCESS);
3521 }
3522 
3523 rib_bufpool_t *
3524 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3525 {
3526 	rib_bufpool_t	*rbp = NULL;
3527 	bufpool_t	*bp = NULL;
3528 	caddr_t		buf;
3529 	ibt_mr_attr_t	mem_attr;
3530 	ibt_status_t	ibt_status;
3531 	int		i, j;
3532 
3533 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3534 
3535 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3536 	    num * sizeof (void *), KM_SLEEP);
3537 
3538 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3539 	bp->numelems = num;
3540 
3541 
3542 	switch (ptype) {
3543 	case SEND_BUFFER:
3544 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3545 		bp->rsize = RPC_MSG_SZ;
3546 		break;
3547 	case RECV_BUFFER:
3548 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3549 		bp->rsize = RPC_BUF_SIZE;
3550 		break;
3551 	default:
3552 		goto fail;
3553 	}
3554 
3555 	/*
3556 	 * Register the pool.
3557 	 */
3558 	bp->bufsize = num * bp->rsize;
3559 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3560 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3561 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3562 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3563 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3564 	rw_enter(&hca->state_lock, RW_READER);
3565 
3566 	if (hca->state != HCA_INITED) {
3567 		rw_exit(&hca->state_lock);
3568 		goto fail;
3569 	}
3570 
3571 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3572 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3573 		mem_attr.mr_vaddr = (uintptr_t)buf;
3574 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3575 		mem_attr.mr_as = NULL;
3576 		ibt_status = ibt_register_mr(hca->hca_hdl,
3577 		    hca->pd_hdl, &mem_attr,
3578 		    &rbp->mr_hdl[i],
3579 		    &rbp->mr_desc[i]);
3580 		if (ibt_status != IBT_SUCCESS) {
3581 			for (j = 0; j < i; j++) {
3582 				(void) ibt_deregister_mr(hca->hca_hdl,
3583 				    rbp->mr_hdl[j]);
3584 			}
3585 			rw_exit(&hca->state_lock);
3586 			goto fail;
3587 		}
3588 	}
3589 	rw_exit(&hca->state_lock);
3590 	buf = (caddr_t)bp->buf;
3591 	for (i = 0; i < num; i++, buf += bp->rsize) {
3592 		bp->buflist[i] = (void *)buf;
3593 	}
3594 	bp->buffree = num - 1;	/* no. of free buffers */
3595 	rbp->bpool = bp;
3596 
3597 	return (rbp);
3598 fail:
3599 	if (bp) {
3600 		if (bp->buf)
3601 			kmem_free(bp->buf, bp->bufsize);
3602 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3603 	}
3604 	if (rbp) {
3605 		if (rbp->mr_hdl)
3606 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3607 		if (rbp->mr_desc)
3608 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3609 		kmem_free(rbp, sizeof (rib_bufpool_t));
3610 	}
3611 	return (NULL);
3612 }
3613 
3614 static void
3615 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3616 {
3617 	int i;
3618 	rib_bufpool_t *rbp = NULL;
3619 	bufpool_t *bp;
3620 
3621 	/*
3622 	 * Obtain pool address based on type of pool
3623 	 */
3624 	switch (ptype) {
3625 		case SEND_BUFFER:
3626 			rbp = hca->send_pool;
3627 			break;
3628 		case RECV_BUFFER:
3629 			rbp = hca->recv_pool;
3630 			break;
3631 		default:
3632 			return;
3633 	}
3634 	if (rbp == NULL)
3635 		return;
3636 
3637 	bp = rbp->bpool;
3638 
3639 	/*
3640 	 * Deregister the pool memory and free it.
3641 	 */
3642 	for (i = 0; i < bp->numelems; i++) {
3643 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3644 	}
3645 }
3646 
3647 static void
3648 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3649 {
3650 
3651 	rib_bufpool_t *rbp = NULL;
3652 	bufpool_t *bp;
3653 
3654 	/*
3655 	 * Obtain pool address based on type of pool
3656 	 */
3657 	switch (ptype) {
3658 		case SEND_BUFFER:
3659 			rbp = hca->send_pool;
3660 			break;
3661 		case RECV_BUFFER:
3662 			rbp = hca->recv_pool;
3663 			break;
3664 		default:
3665 			return;
3666 	}
3667 	if (rbp == NULL)
3668 		return;
3669 
3670 	bp = rbp->bpool;
3671 
3672 	/*
3673 	 * Free the pool memory.
3674 	 */
3675 	if (rbp->mr_hdl)
3676 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3677 
3678 	if (rbp->mr_desc)
3679 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3680 	if (bp->buf)
3681 		kmem_free(bp->buf, bp->bufsize);
3682 	mutex_destroy(&bp->buflock);
3683 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3684 	kmem_free(rbp, sizeof (rib_bufpool_t));
3685 }
3686 
3687 void
3688 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3689 {
3690 	/*
3691 	 * Deregister the pool memory and free it.
3692 	 */
3693 	rib_rbufpool_deregister(hca, ptype);
3694 	rib_rbufpool_free(hca, ptype);
3695 }
3696 
3697 /*
3698  * Fetch a buffer from the pool of type specified in rdbuf->type.
3699  */
3700 static rdma_stat
3701 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3702 {
3703 	rib_lrc_entry_t *rlep;
3704 
3705 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
3706 		rlep = rib_get_cache_buf(conn, rdbuf->len);
3707 		rdbuf->rb_private =  (caddr_t)rlep;
3708 		rdbuf->addr = rlep->lrc_buf;
3709 		rdbuf->handle = rlep->lrc_mhandle;
3710 		return (RDMA_SUCCESS);
3711 	}
3712 
3713 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
3714 	if (rdbuf->addr) {
3715 		switch (rdbuf->type) {
3716 		case SEND_BUFFER:
3717 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
3718 			break;
3719 		case RECV_BUFFER:
3720 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
3721 			break;
3722 		default:
3723 			rdbuf->len = 0;
3724 		}
3725 		return (RDMA_SUCCESS);
3726 	} else
3727 		return (RDMA_FAILED);
3728 }
3729 
#if defined(MEASURE_POOL_DEPTH)
/*
 * Empty hooks that rib_rbuf_alloc() calls with the pool depth when
 * MEASURE_POOL_DEPTH is defined. They do nothing themselves;
 * presumably they exist as attach points for tracing -- confirm
 * before relying on that.
 */
/* ARGSUSED */
static void
rib_recv_bufs(uint32_t x)
{
}

/* ARGSUSED */
static void
rib_send_bufs(uint32_t x)
{
}
#endif
3739 
3740 /*
3741  * Fetch a buffer of specified type.
3742  * Note that rdbuf->handle is mw's rkey.
3743  */
3744 static void *
3745 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3746 {
3747 	rib_qp_t	*qp = ctoqp(conn);
3748 	rib_hca_t	*hca = qp->hca;
3749 	rdma_btype	ptype = rdbuf->type;
3750 	void		*buf;
3751 	rib_bufpool_t	*rbp = NULL;
3752 	bufpool_t	*bp;
3753 	int		i;
3754 
3755 	/*
3756 	 * Obtain pool address based on type of pool
3757 	 */
3758 	switch (ptype) {
3759 	case SEND_BUFFER:
3760 		rbp = hca->send_pool;
3761 		break;
3762 	case RECV_BUFFER:
3763 		rbp = hca->recv_pool;
3764 		break;
3765 	default:
3766 		return (NULL);
3767 	}
3768 	if (rbp == NULL)
3769 		return (NULL);
3770 
3771 	bp = rbp->bpool;
3772 
3773 	mutex_enter(&bp->buflock);
3774 	if (bp->buffree < 0) {
3775 		mutex_exit(&bp->buflock);
3776 		return (NULL);
3777 	}
3778 
3779 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
3780 	buf = bp->buflist[bp->buffree];
3781 	rdbuf->addr = buf;
3782 	rdbuf->len = bp->rsize;
3783 	for (i = bp->numelems - 1; i >= 0; i--) {
3784 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
3785 			rdbuf->handle.mrc_rmr =
3786 			    (uint32_t)rbp->mr_desc[i].md_rkey;
3787 			rdbuf->handle.mrc_linfo =
3788 			    (uintptr_t)rbp->mr_hdl[i];
3789 			rdbuf->handle.mrc_lmr =
3790 			    (uint32_t)rbp->mr_desc[i].md_lkey;
3791 #if defined(MEASURE_POOL_DEPTH)
3792 			if (ptype == SEND_BUFFER)
3793 				rib_send_bufs(MAX_BUFS - (bp->buffree+1));
3794 			if (ptype == RECV_BUFFER)
3795 				rib_recv_bufs(MAX_BUFS - (bp->buffree+1));
3796 #endif
3797 			bp->buffree--;
3798 
3799 			mutex_exit(&bp->buflock);
3800 
3801 			return (buf);
3802 		}
3803 	}
3804 
3805 	mutex_exit(&bp->buflock);
3806 
3807 	return (NULL);
3808 }
3809 
3810 static void
3811 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
3812 {
3813 
3814 	if (rdbuf->type == RDMA_LONG_BUFFER) {
3815 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
3816 		rdbuf->rb_private = NULL;
3817 		return;
3818 	}
3819 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
3820 }
3821 
3822 static void
3823 rib_rbuf_free(CONN *conn, int ptype, void *buf)
3824 {
3825 	rib_qp_t *qp = ctoqp(conn);
3826 	rib_hca_t *hca = qp->hca;
3827 	rib_bufpool_t *rbp = NULL;
3828 	bufpool_t *bp;
3829 
3830 	/*
3831 	 * Obtain pool address based on type of pool
3832 	 */
3833 	switch (ptype) {
3834 	case SEND_BUFFER:
3835 		rbp = hca->send_pool;
3836 		break;
3837 	case RECV_BUFFER:
3838 		rbp = hca->recv_pool;
3839 		break;
3840 	default:
3841 		return;
3842 	}
3843 	if (rbp == NULL)
3844 		return;
3845 
3846 	bp = rbp->bpool;
3847 
3848 	mutex_enter(&bp->buflock);
3849 	if (++bp->buffree >= bp->numelems) {
3850 		/*
3851 		 * Should never happen
3852 		 */
3853 		bp->buffree--;
3854 	} else {
3855 		bp->buflist[bp->buffree] = buf;
3856 	}
3857 	mutex_exit(&bp->buflock);
3858 }
3859 
3860 static rdma_stat
3861 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
3862 {
3863 	rw_enter(&connlist->conn_lock, RW_WRITER);
3864 	if (connlist->conn_hd) {
3865 		cn->c_next = connlist->conn_hd;
3866 		connlist->conn_hd->c_prev = cn;
3867 	}
3868 	connlist->conn_hd = cn;
3869 	rw_exit(&connlist->conn_lock);
3870 
3871 	return (RDMA_SUCCESS);
3872 }
3873 
3874 static rdma_stat
3875 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
3876 {
3877 	rw_enter(&connlist->conn_lock, RW_WRITER);
3878 	if (cn->c_prev) {
3879 		cn->c_prev->c_next = cn->c_next;
3880 	}
3881 	if (cn->c_next) {
3882 		cn->c_next->c_prev = cn->c_prev;
3883 	}
3884 	if (connlist->conn_hd == cn)
3885 		connlist->conn_hd = cn->c_next;
3886 	rw_exit(&connlist->conn_lock);
3887 
3888 	return (RDMA_SUCCESS);
3889 }
3890 
/*
 * Connection management.
 * IBTF does not support recycling of channels. So connections are only
 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
 * C_DISCONN_PEND state. No C_IDLE state.
 * C_CONN_PEND state: Connection establishment in progress to the server.
 * C_CONNECTED state: A connection when created is in C_CONNECTED state.
 * It has an RC channel associated with it. ibt_post_send/recv are allowed
 * only in this state.
 * C_ERROR_CONN state: A connection transitions to this state when WRs on the
 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
 * c_ref drops to 0 (this indicates that RPC has no more references to this
 * connection), the connection should be destroyed. A connection transitions
 * into this state when it is being destroyed.
 *
 * rib_conn_get() looks on the client connection list of the HCA given by
 * 'handle' for a connection whose remote address equals 'svcaddr'. Along
 * the way, unreferenced connections in C_ERROR_CONN state are destroyed.
 * If a usable connection exists it is shared (c_ref is bumped); if one is
 * still being established the caller waits for it; otherwise a new channel
 * is created and connected.
 *
 * On RDMA_SUCCESS *conn is set and the caller owns one reference on it.
 * Other returns:
 *   RDMA_INTR     - cv_timedwait_sig() returned 0 while waiting for a
 *                   pending connection
 *   RDMA_TIMEDOUT - cv_timedwait_sig() returned a negative value, or the
 *                   connection left C_CONN_PEND without becoming
 *                   C_CONNECTED
 *   RDMA_FAILED   - rib_chk_srv_ibaddr() or rib_clnt_create_chan() failed
 *   otherwise the status returned by rib_conn_to_srv().
 */
static rdma_stat
rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
{
	CONN *cn;
	int status = RDMA_SUCCESS;
	rib_hca_t *hca = (rib_hca_t *)handle;
	rib_qp_t *qp;
	clock_t cv_stat, timout;
	ibt_path_info_t path;
	ibt_ip_addr_t s_ip, d_ip;

	/*
	 * The list scan restarts from here every time a stale connection
	 * has been destroyed, because the list lock is dropped to do so.
	 */
again:
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	cn = hca->cl_conn_list.conn_hd;
	while (cn != NULL) {
		/*
		 * First, clear up any connection in the ERROR state
		 */
		mutex_enter(&cn->c_lock);
		if (cn->c_state == C_ERROR_CONN) {
			if (cn->c_ref == 0) {
				/*
				 * Remove connection from list and destroy it.
				 */
				cn->c_state = C_DISCONN_PEND;
				mutex_exit(&cn->c_lock);
				rw_exit(&hca->cl_conn_list.conn_lock);
				(void) rib_disconnect_channel(cn,
				    &hca->cl_conn_list);
				goto again;
			}
			/* still referenced; its last user will destroy it */
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}
		if (cn->c_state == C_DISCONN_PEND) {
			/* already being destroyed by another thread */
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}
		if ((cn->c_raddr.len == svcaddr->len) &&
		    bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
			/*
			 * Our connection. Give up conn list lock
			 * as we are done traversing the list.
			 */
			rw_exit(&hca->cl_conn_list.conn_lock);
			if (cn->c_state == C_CONNECTED) {
				cn->c_ref++;	/* sharing a conn */
				mutex_exit(&cn->c_lock);
				*conn = cn;
				return (status);
			}
			if (cn->c_state == C_CONN_PEND) {
				/*
				 * Hold a reference to this conn before
				 * we give up the lock.
				 */
				cn->c_ref++;
				/* absolute deadline, in lbolt ticks */
				timout =  ddi_get_lbolt() +
				    drv_usectohz(CONN_WAIT_TIME * 1000000);
				/*
				 * Keep waiting while wakeups are normal
				 * (> 0) and the connection is still pending.
				 */
				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
				    &cn->c_lock, timout)) > 0 &&
				    cn->c_state == C_CONN_PEND)
					;
				if (cv_stat == 0) {
					cn->c_ref--;
					mutex_exit(&cn->c_lock);
					return (RDMA_INTR);
				}
				if (cv_stat < 0) {
					cn->c_ref--;
					mutex_exit(&cn->c_lock);
					return (RDMA_TIMEDOUT);
				}
				if (cn->c_state == C_CONNECTED) {
					/* keep the reference taken above */
					*conn = cn;
					mutex_exit(&cn->c_lock);
					return (status);
				} else {
					cn->c_ref--;
					mutex_exit(&cn->c_lock);
					return (RDMA_TIMEDOUT);
				}
			}
		}
		mutex_exit(&cn->c_lock);
		cn = cn->c_next;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);

	bzero(&path, sizeof (ibt_path_info_t));
	bzero(&s_ip, sizeof (ibt_ip_addr_t));
	bzero(&d_ip, sizeof (ibt_ip_addr_t));

	/* Any failure here is reported to the caller as RDMA_FAILED. */
	status = rib_chk_srv_ibaddr(svcaddr, addr_type, &path, &s_ip, &d_ip);
	if (status != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}

	/*
	 * Channel to server doesn't exist yet, create one.
	 */
	if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	cn = qptoc(qp);
	cn->c_state = C_CONN_PEND;
	cn->c_ref = 1;

	/*
	 * Add to conn list.
	 * We had given up the READER lock. In the time since then,
	 * another thread might have created the connection we are
	 * trying here. But for now, that is quite alright - there
	 * might be two connections between a pair of hosts instead
	 * of one. If we really want to close that window,
	 * then need to check the list after acquiring the
	 * WRITER lock.
	 */
	(void) rib_add_connlist(cn, &hca->cl_conn_list);
	status = rib_conn_to_srv(hca, qp, &path, &s_ip, &d_ip);
	mutex_enter(&cn->c_lock);
	if (status == RDMA_SUCCESS) {
		cn->c_state = C_CONNECTED;
		*conn = cn;
	} else {
		/*
		 * Leave the failed connection on the list in the ERROR
		 * state with our reference dropped.
		 */
		cn->c_state = C_ERROR_CONN;
		cn->c_ref--;
	}
	/* wake any threads waiting in the C_CONN_PEND branch above */
	cv_broadcast(&cn->c_cv);
	mutex_exit(&cn->c_lock);
	return (status);
}
4042 
4043 static rdma_stat
4044 rib_conn_release(CONN *conn)
4045 {
4046 	rib_qp_t	*qp = ctoqp(conn);
4047 
4048 	mutex_enter(&conn->c_lock);
4049 	conn->c_ref--;
4050 
4051 	/*
4052 	 * If a conn is C_ERROR_CONN, close the channel.
4053 	 * If it's CONNECTED, keep it that way.
4054 	 */
4055 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4056 		conn->c_state = C_DISCONN_PEND;
4057 		mutex_exit(&conn->c_lock);
4058 		if (qp->mode == RIB_SERVER)
4059 			(void) rib_disconnect_channel(conn,
4060 			    &qp->hca->srv_conn_list);
4061 		else
4062 			(void) rib_disconnect_channel(conn,
4063 			    &qp->hca->cl_conn_list);
4064 		return (RDMA_SUCCESS);
4065 	}
4066 	mutex_exit(&conn->c_lock);
4067 	return (RDMA_SUCCESS);
4068 }
4069 
4070 /*
4071  * Add at front of list
4072  */
4073 static struct rdma_done_list *
4074 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4075 {
4076 	struct rdma_done_list *rd;
4077 
4078 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4079 
4080 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4081 	rd->xid = xid;
4082 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4083 
4084 	rd->prev = NULL;
4085 	rd->next = qp->rdlist;
4086 	if (qp->rdlist != NULL)
4087 		qp->rdlist->prev = rd;
4088 	qp->rdlist = rd;
4089 
4090 	return (rd);
4091 }
4092 
4093 static void
4094 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4095 {
4096 	struct rdma_done_list *r;
4097 
4098 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4099 
4100 	r = rd->next;
4101 	if (r != NULL) {
4102 		r->prev = rd->prev;
4103 	}
4104 
4105 	r = rd->prev;
4106 	if (r != NULL) {
4107 		r->next = rd->next;
4108 	} else {
4109 		qp->rdlist = rd->next;
4110 	}
4111 
4112 	cv_destroy(&rd->rdma_done_cv);
4113 	kmem_free(rd, sizeof (*rd));
4114 }
4115 
4116 static void
4117 rdma_done_rem_list(rib_qp_t *qp)
4118 {
4119 	struct rdma_done_list	*r, *n;
4120 
4121 	mutex_enter(&qp->rdlist_lock);
4122 	for (r = qp->rdlist; r != NULL; r = n) {
4123 		n = r->next;
4124 		rdma_done_rm(qp, r);
4125 	}
4126 	mutex_exit(&qp->rdlist_lock);
4127 }
4128 
4129 static void
4130 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4131 {
4132 	struct rdma_done_list *r = qp->rdlist;
4133 
4134 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4135 
4136 	while (r) {
4137 		if (r->xid == xid) {
4138 			cv_signal(&r->rdma_done_cv);
4139 			return;
4140 		} else {
4141 			r = r->next;
4142 		}
4143 	}
4144 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4145 	    int, xid);
4146 }
4147 
4148 
4149 /*
4150  * Goes through all connections and closes the channel
4151  * This will cause all the WRs on those channels to be
4152  * flushed.
4153  */
4154 static void
4155 rib_close_channels(rib_conn_list_t *connlist)
4156 {
4157 	CONN 		*conn;
4158 	rib_qp_t	*qp;
4159 
4160 	rw_enter(&connlist->conn_lock, RW_READER);
4161 	conn = connlist->conn_hd;
4162 	while (conn != NULL) {
4163 		mutex_enter(&conn->c_lock);
4164 		qp = ctoqp(conn);
4165 		if (conn->c_state == C_CONNECTED) {
4166 			/*
4167 			 * Live connection in CONNECTED state.
4168 			 * Call ibt_close_rc_channel in nonblocking mode
4169 			 * with no callbacks.
4170 			 */
4171 			conn->c_state = C_ERROR_CONN;
4172 			(void) ibt_close_rc_channel(qp->qp_hdl,
4173 			    IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
4174 			(void) ibt_free_channel(qp->qp_hdl);
4175 			qp->qp_hdl = NULL;
4176 		} else {
4177 			if (conn->c_state == C_ERROR_CONN &&
4178 			    qp->qp_hdl != NULL) {
4179 				/*
4180 				 * Connection in ERROR state but
4181 				 * channel is not yet freed.
4182 				 */
4183 				(void) ibt_close_rc_channel(qp->qp_hdl,
4184 				    IBT_NOCALLBACKS, NULL, 0, NULL,
4185 				    NULL, 0);
4186 				(void) ibt_free_channel(qp->qp_hdl);
4187 				qp->qp_hdl = NULL;
4188 			}
4189 		}
4190 		mutex_exit(&conn->c_lock);
4191 		conn = conn->c_next;
4192 	}
4193 	rw_exit(&connlist->conn_lock);
4194 }
4195 
4196 /*
4197  * Frees up all connections that are no longer being referenced
4198  */
4199 static void
4200 rib_purge_connlist(rib_conn_list_t *connlist)
4201 {
4202 	CONN 		*conn;
4203 
4204 top:
4205 	rw_enter(&connlist->conn_lock, RW_READER);
4206 	conn = connlist->conn_hd;
4207 	while (conn != NULL) {
4208 		mutex_enter(&conn->c_lock);
4209 
4210 		/*
4211 		 * At this point connection is either in ERROR
4212 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4213 		 * then some other thread is culling that connection.
4214 		 * If not and if c_ref is 0, then destroy the connection.
4215 		 */
4216 		if (conn->c_ref == 0 &&
4217 		    conn->c_state != C_DISCONN_PEND) {
4218 			/*
4219 			 * Cull the connection
4220 			 */
4221 			conn->c_state = C_DISCONN_PEND;
4222 			mutex_exit(&conn->c_lock);
4223 			rw_exit(&connlist->conn_lock);
4224 			(void) rib_disconnect_channel(conn, connlist);
4225 			goto top;
4226 		} else {
4227 			/*
4228 			 * conn disconnect already scheduled or will
4229 			 * happen from conn_release when c_ref drops to 0.
4230 			 */
4231 			mutex_exit(&conn->c_lock);
4232 		}
4233 		conn = conn->c_next;
4234 	}
4235 	rw_exit(&connlist->conn_lock);
4236 
4237 	/*
4238 	 * At this point, only connections with c_ref != 0 are on the list
4239 	 */
4240 }
4241 
4242 /*
4243  * Cleans and closes up all uses of the HCA
4244  */
4245 static void
4246 rib_detach_hca(rib_hca_t *hca)
4247 {
4248 
4249 	/*
4250 	 * Stop all services on the HCA
4251 	 * Go through cl_conn_list and close all rc_channels
4252 	 * Go through svr_conn_list and close all rc_channels
4253 	 * Free connections whose c_ref has dropped to 0
4254 	 * Destroy all CQs
4255 	 * Deregister and released all buffer pool memory after all
4256 	 * connections are destroyed
4257 	 * Free the protection domain
4258 	 * ibt_close_hca()
4259 	 */
4260 	rw_enter(&hca->state_lock, RW_WRITER);
4261 	if (hca->state == HCA_DETACHED) {
4262 		rw_exit(&hca->state_lock);
4263 		return;
4264 	}
4265 
4266 	hca->state = HCA_DETACHED;
4267 	rib_stat->nhca_inited--;
4268 
4269 	rib_stop_services(hca);
4270 	rib_close_channels(&hca->cl_conn_list);
4271 	rib_close_channels(&hca->srv_conn_list);
4272 	rw_exit(&hca->state_lock);
4273 
4274 	rib_purge_connlist(&hca->cl_conn_list);
4275 	rib_purge_connlist(&hca->srv_conn_list);
4276 
4277 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4278 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4279 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4280 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4281 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4282 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4283 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4284 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4285 
4286 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4287 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4288 	if (hca->srv_conn_list.conn_hd == NULL &&
4289 	    hca->cl_conn_list.conn_hd == NULL) {
4290 		/*
4291 		 * conn_lists are NULL, so destroy
4292 		 * buffers, close hca and be done.
4293 		 */
4294 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4295 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4296 		rib_destroy_cache(hca);
4297 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4298 		(void) ibt_close_hca(hca->hca_hdl);
4299 		hca->hca_hdl = NULL;
4300 	}
4301 	rw_exit(&hca->cl_conn_list.conn_lock);
4302 	rw_exit(&hca->srv_conn_list.conn_lock);
4303 
4304 	if (hca->hca_hdl != NULL) {
4305 		mutex_enter(&hca->inuse_lock);
4306 		while (hca->inuse)
4307 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4308 		mutex_exit(&hca->inuse_lock);
4309 		/*
4310 		 * conn_lists are now NULL, so destroy
4311 		 * buffers, close hca and be done.
4312 		 */
4313 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4314 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4315 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4316 		(void) ibt_close_hca(hca->hca_hdl);
4317 		hca->hca_hdl = NULL;
4318 	}
4319 }
4320 
4321 static void
4322 rib_server_side_cache_reclaim(void *argp)
4323 {
4324 	cache_avl_struct_t    *rcas;
4325 	rib_lrc_entry_t		*rb;
4326 	rib_hca_t *hca = (rib_hca_t *)argp;
4327 
4328 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4329 	rcas = avl_first(&hca->avl_tree);
4330 	if (rcas != NULL)
4331 		avl_remove(&hca->avl_tree, rcas);
4332 
4333 	while (rcas != NULL) {
4334 		while (rcas->r.forw != &rcas->r) {
4335 			rcas->elements--;
4336 			rib_total_buffers --;
4337 			rb = rcas->r.forw;
4338 			remque(rb);
4339 			if (rb->registered)
4340 				(void) rib_deregistermem_via_hca(hca,
4341 				    rb->lrc_buf, rb->lrc_mhandle);
4342 			cache_allocation -= rb->lrc_len;
4343 			kmem_free(rb->lrc_buf, rb->lrc_len);
4344 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4345 		}
4346 		mutex_destroy(&rcas->node_lock);
4347 		kmem_cache_free(hca->server_side_cache, rcas);
4348 		rcas = avl_first(&hca->avl_tree);
4349 		if (rcas != NULL)
4350 			avl_remove(&hca->avl_tree, rcas);
4351 	}
4352 	rw_exit(&hca->avl_rw_lock);
4353 }
4354 
4355 static void
4356 rib_server_side_cache_cleanup(void *argp)
4357 {
4358 	cache_avl_struct_t    *rcas;
4359 	rib_lrc_entry_t		*rb;
4360 	rib_hca_t *hca = (rib_hca_t *)argp;
4361 
4362 	rw_enter(&hca->avl_rw_lock, RW_READER);
4363 	if (cache_allocation < cache_limit) {
4364 		rw_exit(&hca->avl_rw_lock);
4365 		return;
4366 	}
4367 	rw_exit(&hca->avl_rw_lock);
4368 
4369 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4370 	rcas = avl_last(&hca->avl_tree);
4371 	if (rcas != NULL)
4372 		avl_remove(&hca->avl_tree, rcas);
4373 
4374 	while (rcas != NULL) {
4375 		while (rcas->r.forw != &rcas->r) {
4376 			rcas->elements--;
4377 			rib_total_buffers --;
4378 			rb = rcas->r.forw;
4379 			remque(rb);
4380 			if (rb->registered)
4381 				(void) rib_deregistermem_via_hca(hca,
4382 				    rb->lrc_buf, rb->lrc_mhandle);
4383 			cache_allocation -= rb->lrc_len;
4384 			kmem_free(rb->lrc_buf, rb->lrc_len);
4385 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4386 		}
4387 		mutex_destroy(&rcas->node_lock);
4388 		kmem_cache_free(hca->server_side_cache, rcas);
4389 		if ((cache_allocation) < cache_limit) {
4390 			rw_exit(&hca->avl_rw_lock);
4391 			return;
4392 		}
4393 
4394 		rcas = avl_last(&hca->avl_tree);
4395 		if (rcas != NULL)
4396 			avl_remove(&hca->avl_tree, rcas);
4397 	}
4398 	rw_exit(&hca->avl_rw_lock);
4399 }
4400 
4401 static int
4402 avl_compare(const void *t1, const void *t2)
4403 {
4404 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4405 		return (0);
4406 
4407 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4408 		return (-1);
4409 
4410 	return (1);
4411 }
4412 
4413 static void
4414 rib_destroy_cache(rib_hca_t *hca)
4415 {
4416 	if (hca->reg_cache_clean_up != NULL) {
4417 		ddi_taskq_destroy(hca->reg_cache_clean_up);
4418 		hca->reg_cache_clean_up = NULL;
4419 	}
4420 	if (!hca->avl_init) {
4421 		kmem_cache_destroy(hca->server_side_cache);
4422 		avl_destroy(&hca->avl_tree);
4423 		mutex_destroy(&hca->cache_allocation);
4424 		rw_destroy(&hca->avl_rw_lock);
4425 	}
4426 	hca->avl_init = FALSE;
4427 }
4428 
4429 static void
4430 rib_force_cleanup(void *hca)
4431 {
4432 	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
4433 		(void) ddi_taskq_dispatch(
4434 		    ((rib_hca_t *)hca)->reg_cache_clean_up,
4435 		    rib_server_side_cache_cleanup,
4436 		    (void *)hca, DDI_NOSLEEP);
4437 }
4438 
4439 static rib_lrc_entry_t *
4440 rib_get_cache_buf(CONN *conn, uint32_t len)
4441 {
4442 	cache_avl_struct_t	cas, *rcas;
4443 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4444 	rib_lrc_entry_t *reply_buf;
4445 	avl_index_t where = NULL;
4446 	uint64_t c_alloc = 0;
4447 
4448 	if (!hca->avl_init)
4449 		goto  error_alloc;
4450 
4451 	cas.len = len;
4452 
4453 	rw_enter(&hca->avl_rw_lock, RW_READER);
4454 
4455 	mutex_enter(&hca->cache_allocation);
4456 	c_alloc = cache_allocation;
4457 	mutex_exit(&hca->cache_allocation);
4458 
4459 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
4460 	    &where)) == NULL) {
4461 		/* Am I above the cache limit */
4462 		if ((c_alloc + len) >= cache_limit) {
4463 			rib_force_cleanup((void *)hca);
4464 			rw_exit(&hca->avl_rw_lock);
4465 			cache_misses_above_the_limit ++;
4466 
4467 			/* Allocate and register the buffer directly */
4468 			goto error_alloc;
4469 		}
4470 
4471 		rw_exit(&hca->avl_rw_lock);
4472 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
4473 
4474 		/* Recheck to make sure no other thread added the entry in */
4475 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
4476 		    &cas, &where)) == NULL) {
4477 			/* Allocate an avl tree entry */
4478 			rcas = (cache_avl_struct_t *)
4479 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
4480 
4481 			bzero(rcas, sizeof (cache_avl_struct_t));
4482 			rcas->elements = 0;
4483 			rcas->r.forw = &rcas->r;
4484 			rcas->r.back = &rcas->r;
4485 			rcas->len = len;
4486 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
4487 			avl_insert(&hca->avl_tree, rcas, where);
4488 		}
4489 	}
4490 
4491 	mutex_enter(&rcas->node_lock);
4492 
4493 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
4494 		rib_total_buffers--;
4495 		cache_hits++;
4496 		reply_buf = rcas->r.forw;
4497 		remque(reply_buf);
4498 		rcas->elements--;
4499 		mutex_exit(&rcas->node_lock);
4500 		rw_exit(&hca->avl_rw_lock);
4501 		mutex_enter(&hca->cache_allocation);
4502 		cache_allocation -= len;
4503 		mutex_exit(&hca->cache_allocation);
4504 	} else {
4505 		/* Am I above the cache limit */
4506 		mutex_exit(&rcas->node_lock);
4507 		if ((c_alloc + len) >= cache_limit) {
4508 			rib_force_cleanup((void *)hca);
4509 			rw_exit(&hca->avl_rw_lock);
4510 			cache_misses_above_the_limit ++;
4511 			/* Allocate and register the buffer directly */
4512 			goto error_alloc;
4513 		}
4514 		rw_exit(&hca->avl_rw_lock);
4515 		cache_misses ++;
4516 		/* Allocate a reply_buf entry */
4517 		reply_buf = (rib_lrc_entry_t *)
4518 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4519 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
4520 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
4521 		reply_buf->lrc_len  = len;
4522 		reply_buf->registered = FALSE;
4523 		reply_buf->avl_node = (void *)rcas;
4524 	}
4525 
4526 	return (reply_buf);
4527 
4528 error_alloc:
4529 	reply_buf = (rib_lrc_entry_t *)
4530 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4531 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
4532 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
4533 	reply_buf->lrc_len = len;
4534 	reply_buf->registered = FALSE;
4535 	reply_buf->avl_node = NULL;
4536 
4537 	return (reply_buf);
4538 }
4539 
4540 /*
4541  * Return a pre-registered back to the cache (without
4542  * unregistering the buffer)..
4543  */
4544 
4545 static void
4546 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
4547 {
4548 	cache_avl_struct_t    cas, *rcas;
4549 	avl_index_t where = NULL;
4550 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4551 
4552 	if (!hca->avl_init)
4553 		goto  error_free;
4554 
4555 	cas.len = reg_buf->lrc_len;
4556 	rw_enter(&hca->avl_rw_lock, RW_READER);
4557 	if ((rcas = (cache_avl_struct_t *)
4558 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
4559 		rw_exit(&hca->avl_rw_lock);
4560 		goto error_free;
4561 	} else {
4562 		rib_total_buffers ++;
4563 		cas.len = reg_buf->lrc_len;
4564 		mutex_enter(&rcas->node_lock);
4565 		insque(reg_buf, &rcas->r);
4566 		rcas->elements ++;
4567 		mutex_exit(&rcas->node_lock);
4568 		rw_exit(&hca->avl_rw_lock);
4569 		mutex_enter(&hca->cache_allocation);
4570 		cache_allocation += cas.len;
4571 		mutex_exit(&hca->cache_allocation);
4572 	}
4573 
4574 	return;
4575 
4576 error_free:
4577 
4578 	if (reg_buf->registered)
4579 		(void) rib_deregistermem_via_hca(hca,
4580 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
4581 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
4582 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
4583 }
4584 
4585 static rdma_stat
4586 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
4587 	uint_t buflen, struct mrc *buf_handle)
4588 {
4589 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
4590 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
4591 	rdma_stat	status;
4592 
4593 
4594 	/*
4595 	 * Note: ALL buffer pools use the same memory type RDMARW.
4596 	 */
4597 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
4598 	if (status == RDMA_SUCCESS) {
4599 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
4600 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4601 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4602 	} else {
4603 		buf_handle->mrc_linfo = NULL;
4604 		buf_handle->mrc_lmr = 0;
4605 		buf_handle->mrc_rmr = 0;
4606 	}
4607 	return (status);
4608 }
4609 
4610 /* ARGSUSED */
4611 static rdma_stat
4612 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
4613     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
4614 {
4615 
4616 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
4617 	return (RDMA_SUCCESS);
4618 }
4619 
4620 /* ARGSUSED */
4621 static rdma_stat
4622 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
4623 {
4624 
4625 	(void) ibt_deregister_mr(hca->hca_hdl,
4626 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4627 	return (RDMA_SUCCESS);
4628 }
4629 
4630 /*
4631  * Check if the IP interface named by `lifrp' is RDMA-capable.
4632  */
4633 static boolean_t
4634 rpcib_rdma_capable_interface(struct lifreq *lifrp)
4635 {
4636 	char ifname[LIFNAMSIZ];
4637 	char *cp;
4638 
4639 	if (lifrp->lifr_type == IFT_IB)
4640 		return (B_TRUE);
4641 
4642 	/*
4643 	 * Strip off the logical interface portion before getting
4644 	 * intimate with the name.
4645 	 */
4646 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
4647 	if ((cp = strchr(ifname, ':')) != NULL)
4648 		*cp = '\0';
4649 
4650 	return (strcmp("lo0", ifname) == 0);
4651 }
4652 
4653 static int
4654 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
4655 {
4656 	vnode_t *kvp, *vp;
4657 	TIUSER  *tiptr;
4658 	struct  strioctl iocb;
4659 	k_sigset_t smask;
4660 	int	err = 0;
4661 
4662 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
4663 		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
4664 		    &tiptr, CRED()) == 0) {
4665 			vp = tiptr->fp->f_vnode;
4666 		} else {
4667 			VN_RELE(kvp);
4668 			return (EPROTO);
4669 		}
4670 	} else {
4671 		return (EPROTO);
4672 	}
4673 
4674 	iocb.ic_cmd = cmd;
4675 	iocb.ic_timout = 0;
4676 	iocb.ic_len = len;
4677 	iocb.ic_dp = (caddr_t)arg;
4678 	sigintr(&smask, 0);
4679 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4680 	sigunintr(&smask);
4681 	(void) t_kclose(tiptr, 0);
4682 	VN_RELE(kvp);
4683 	return (err);
4684 }
4685 
4686 /*
4687  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
4688  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
4689  */
4690 static int
4691 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
4692 {
4693 	int err;
4694 	struct lifnum lifn;
4695 
4696 	bzero(&lifn, sizeof (struct lifnum));
4697 	lifn.lifn_family = AF_UNSPEC;
4698 
4699 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
4700 	if (err != 0)
4701 		return (err);
4702 
4703 	/*
4704 	 * Pad the interface count to account for additional interfaces that
4705 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
4706 	 */
4707 	lifn.lifn_count += 4;
4708 
4709 	bzero(lifcp, sizeof (struct lifconf));
4710 	lifcp->lifc_family = AF_UNSPEC;
4711 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
4712 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
4713 
4714 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
4715 	if (err != 0) {
4716 		kmem_free(lifcp->lifc_buf, *bufsizep);
4717 		return (err);
4718 	}
4719 	return (0);
4720 }
4721 
4722 static boolean_t
4723 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
4724 {
4725 	uint_t i, nifs;
4726 	uint_t bufsize;
4727 	struct lifconf lifc;
4728 	struct lifreq *lifrp;
4729 	struct sockaddr_in *sinp;
4730 	struct sockaddr_in6 *sin6p;
4731 
4732 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
4733 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
4734 
4735 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
4736 		return (B_FALSE);
4737 
4738 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
4739 		kmem_free(lifc.lifc_buf, bufsize);
4740 		return (B_FALSE);
4741 	}
4742 
4743 	/*
4744 	 * Worst case is that all of the addresses are IB-capable and have
4745 	 * the same address family, so size our buffers accordingly.
4746 	 */
4747 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
4748 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
4749 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
4750 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
4751 
4752 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
4753 		if (!rpcib_rdma_capable_interface(lifrp))
4754 			continue;
4755 
4756 		if (lifrp->lifr_addr.ss_family == AF_INET) {
4757 			sinp = addrs4->ri_list;
4758 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
4759 			    sizeof (struct sockaddr_in));
4760 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
4761 			sin6p = addrs6->ri_list;
4762 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
4763 			    sizeof (struct sockaddr_in6));
4764 		}
4765 	}
4766 
4767 	kmem_free(lifc.lifc_buf, bufsize);
4768 	return (B_TRUE);
4769 }
4770 
4771 /* ARGSUSED */
4772 static int rpcib_cache_kstat_update(kstat_t *ksp, int rw) {
4773 
4774 	if (KSTAT_WRITE == rw) {
4775 		return (EACCES);
4776 	}
4777 	rpcib_kstat.cache_limit.value.ui64 =
4778 	    (uint64_t)cache_limit;
4779 	rpcib_kstat.cache_allocation.value.ui64 =
4780 	    (uint64_t)cache_allocation;
4781 	rpcib_kstat.cache_hits.value.ui64 =
4782 	    (uint64_t)cache_hits;
4783 	rpcib_kstat.cache_misses.value.ui64 =
4784 	    (uint64_t)cache_misses;
4785 	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
4786 	    (uint64_t)cache_misses_above_the_limit;
4787 	return (0);
4788 }
4789