xref: /titanic_51/usr/src/uts/common/rpc/rpcib.c (revision 82b5053fd84070f027436bc66665b7c0fd416e85)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 
27 /*
28  * Copyright (c) 2007, The Ohio State University. All rights reserved.
29  *
30  * Portions of this source code is developed by the team members of
31  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
32  * headed by Professor Dhabaleswar K. (DK) Panda.
33  *
34  * Acknowledgements to contributions from developers:
35  *   Ranjit Noronha: noronha@cse.ohio-state.edu
36  *   Lei Chai      : chail@cse.ohio-state.edu
37  *   Weikuan Yu    : yuw@cse.ohio-state.edu
38  *
39  */
40 
41 /*
42  * The rpcib plugin. Implements the interface for RDMATF's
43  * interaction with IBTF.
44  */
45 
46 #include <sys/param.h>
47 #include <sys/types.h>
48 #include <sys/user.h>
49 #include <sys/systm.h>
50 #include <sys/sysmacros.h>
51 #include <sys/proc.h>
52 #include <sys/socket.h>
53 #include <sys/file.h>
54 #include <sys/stream.h>
55 #include <sys/strsubr.h>
56 #include <sys/stropts.h>
57 #include <sys/errno.h>
58 #include <sys/kmem.h>
59 #include <sys/debug.h>
60 #include <sys/systm.h>
61 #include <sys/pathname.h>
62 #include <sys/kstat.h>
63 #include <sys/t_lock.h>
64 #include <sys/ddi.h>
65 #include <sys/cmn_err.h>
66 #include <sys/time.h>
67 #include <sys/isa_defs.h>
68 #include <sys/callb.h>
69 #include <sys/sunddi.h>
70 #include <sys/sunndi.h>
71 #include <sys/sunldi.h>
72 #include <sys/sdt.h>
73 #include <sys/dlpi.h>
74 #include <sys/ib/ibtl/ibti.h>
75 #include <rpc/rpc.h>
76 #include <rpc/ib.h>
77 
78 #include <sys/modctl.h>
79 
80 #include <sys/pathname.h>
81 #include <sys/kstr.h>
82 #include <sys/sockio.h>
83 #include <sys/vnode.h>
84 #include <sys/tiuser.h>
85 #include <net/if.h>
86 #include <sys/cred.h>
87 #include <rpc/rpc_rdma.h>
88 
89 #include <nfs/nfs.h>
90 #include <sys/kstat.h>
91 #include <sys/atomic.h>
92 
/*
 * Port number constant; presumably the NFS-over-RDMA service port.
 * NOTE(review): its users are outside this part of the file - confirm.
 */
93 #define	NFS_RDMA_PORT	2050
94 
/* Declared locally instead of being pulled in through a header. */
95 extern char *inet_ntop(int, const void *, char *, int);
96 
97 
98 /*
99  * Prototype declarations for driver ops
 *
 * rpcib_getinfo(), rpcib_attach() and rpcib_detach() are the entry points
 * wired into rpcib_ops below.  rpcib_cache_kstat_update() is installed by
 * open_hcas() as the update callback of the "rpcib_cache" kstat.
100  */
101 
102 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
103 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
104 				void *, void **);
105 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
106 static int	rpcib_is_ib_interface(char *);
107 static int	rpcib_dl_info(ldi_handle_t, dl_info_ack_t *);
108 static int	rpcib_do_ip_ioctl(int, int, caddr_t);
109 static boolean_t	rpcib_get_ib_addresses(struct sockaddr_in *,
110 			struct sockaddr_in6 *, uint_t *, uint_t *);
111 static	uint_t rpcib_get_number_interfaces(void);
112 static int rpcib_cache_kstat_update(kstat_t *, int);
113 static void rib_force_cleanup(void *);
114 
/*
 * Template for the "rpcib_cache" named kstat.  open_hcas() points the
 * kstat's ks_data at this object and installs rpcib_cache_kstat_update()
 * as its update callback.  Each member is a 64-bit unsigned statistic; the
 * initializer list pairs every member, in declaration order, with the name
 * under which it is exported.
 */
115 struct {
116 	kstat_named_t cache_limit;
117 	kstat_named_t cache_allocation;
118 	kstat_named_t cache_hits;
119 	kstat_named_t cache_misses;
120 	kstat_named_t cache_misses_above_the_limit;
121 } rpcib_kstat = {
122 	{"cache_limit",			KSTAT_DATA_UINT64 },
123 	{"cache_allocation",		KSTAT_DATA_UINT64 },
124 	{"cache_hits",			KSTAT_DATA_UINT64 },
125 	{"cache_misses",		KSTAT_DATA_UINT64 },
126 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
127 };
128 
129 /* rpcib cb_ops */
/*
 * Character/block entry points referenced from rpcib_ops.  The driver
 * supplies no open/close/read/write/ioctl logic of its own: every slot is
 * one of the stock nulldev, nodev, nochpoll or ddi_prop_op routines, there
 * is no STREAMS table, and the only flag set is D_MP.
 */
130 static struct cb_ops rpcib_cbops = {
131 	nulldev,		/* open */
132 	nulldev,		/* close */
133 	nodev,			/* strategy */
134 	nodev,			/* print */
135 	nodev,			/* dump */
136 	nodev,			/* read */
137 	nodev,			/* write */
138 	nodev,			/* ioctl */
139 	nodev,			/* devmap */
140 	nodev,			/* mmap */
141 	nodev,			/* segmap */
142 	nochpoll,		/* poll */
143 	ddi_prop_op,		/* prop_op */
144 	NULL,			/* stream */
145 	D_MP,			/* cb_flag */
146 	CB_REV,			/* rev */
147 	nodev,			/* int (*cb_aread)() */
148 	nodev			/* int (*cb_awrite)() */
149 };
150 
151 
152 
153 
154 /*
155  * Device options
 *
 * dev_ops vector for the driver.  Only getinfo, attach and detach are
 * implemented in this file (rpcib_getinfo, rpcib_attach, rpcib_detach);
 * identify and probe are nulldev, reset is nodev, and there are no bus or
 * power operations.  The character/block entry points come from
 * rpcib_cbops.
156  */
157 static struct dev_ops rpcib_ops = {
158 	DEVO_REV,		/* devo_rev, */
159 	0,			/* refcnt  */
160 	rpcib_getinfo,		/* info */
161 	nulldev,		/* identify */
162 	nulldev,		/* probe */
163 	rpcib_attach,		/* attach */
164 	rpcib_detach,		/* detach */
165 	nodev,			/* reset */
166 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
167 	NULL,			/* bus operations */
168 	NULL			/* power */
169 };
170 
171 /*
172  * Module linkage information.
 *
 * rib_modldrv describes this module as a device driver and ties it to
 * rpcib_ops.  rib_modlinkage is the object handed to mod_install() in
 * _init(), mod_remove() in _fini(), mod_info() in _info() and
 * ldi_ident_from_mod() in _init().
173  */
174 
175 static struct modldrv rib_modldrv = {
176 	&mod_driverops,			    /* Driver module */
177 	"RPCIB plugin driver, ver 1.30", /* Driver name and version */
178 	&rpcib_ops,		    /* Driver ops */
179 };
180 
181 static struct modlinkage rib_modlinkage = {
182 	MODREV_1,
183 	(void *)&rib_modldrv,
184 	NULL
185 };
186 
/*
 * Descriptor for one long-reply-cache buffer.
 * NOTE(review): the field roles below are inferred from their names and
 * types only; the code that fills them in is outside this part of the
 * file - confirm before relying on them.
 *   forw/back          - doubly linked list linkage
 *   lrc_buf/lrc_len    - the buffer and its length
 *   avl_node           - presumably the owning cache_avl_struct_t
 *   registered         - presumably set once the buffer is registered
 *   lrc_mhandle        - memory registration handle (struct mrc)
 *   lrc_on_freed_list  - presumably set while the entry is on a free list
 */
187 typedef struct rib_lrc_entry {
188 	struct rib_lrc_entry *forw;
189 	struct rib_lrc_entry *back;
190 	char *lrc_buf;
191 
192 	uint32_t lrc_len;
193 	void  *avl_node;
194 	bool_t registered;
195 
196 	struct mrc lrc_mhandle;
197 	bool_t lrc_on_freed_list;
198 } rib_lrc_entry_t;
199 
/*
 * Node type of the per-HCA AVL tree created in open_hcas().  The tree and
 * the per-HCA kmem cache are both sized with sizeof (cache_avl_struct_t),
 * and avl_link is the member whose offset is handed to avl_create() as the
 * tree linkage.
 */
200 typedef	struct cache_struct	{
201 	rib_lrc_entry_t		r;
202 	uint32_t		len;
203 	uint32_t		elements;
204 	kmutex_t		node_lock;
205 	avl_node_t		avl_link;
206 } cache_avl_struct_t;
207 
208 
/*
 * Registration-cache accounting.  cache_limit starts at 100MB and
 * cache_watermark at 80MB.  Five of these names (cache_limit,
 * cache_allocation, cache_hits, cache_misses and
 * cache_misses_above_the_limit) match statistics exported through
 * rpcib_kstat; NOTE(review): the copy presumably happens in
 * rpcib_cache_kstat_update(), which is not shown here.
 * stats_enabled is set to TRUE by open_hcas() once the kstat has been
 * installed, so the kstat is created only once.
 */
209 static uint64_t 	rib_total_buffers = 0;
210 uint64_t	cache_limit = 100 * 1024 * 1024;
211 static volatile uint64_t	cache_allocation = 0;
212 static uint64_t	cache_watermark = 80 * 1024 * 1024;
213 static uint64_t	cache_hits = 0;
214 static uint64_t	cache_misses = 0;
215 static uint64_t	cache_cold_misses = 0;
216 static uint64_t	cache_hot_misses = 0;
217 static uint64_t	cache_misses_above_the_limit = 0;
218 static bool_t	stats_enabled = FALSE;
219 
/* NOTE(review): tunable; its consumer is outside this part of the file. */
220 static uint64_t max_unsignaled_rws = 5;
221 
222 /*
223  * rib_stat: private data pointer used when registering
224  *	with the IBTF.  It is returned to the consumer
225  *	in all callbacks.
 *	Allocated in rpcib_attach(), which passes it to ibt_attach()
 *	as the client private argument.
226  */
227 static rpcib_state_t *rib_stat = NULL;
228 
229 #define	RNR_RETRIES	IBT_RNR_RETRY_1
230 #define	MAX_PORTS	2
231 
/* NOTE(review): tunables; their consumers are outside this part of the file. */
232 int preposted_rbufs = RDMA_BUFS_GRANT;
233 int send_threshold = 1;
234 
235 /*
236  * State of the plugin.
237  * ACCEPT = accepting new connections and requests.
238  * NO_ACCEPT = not accepting new connection and requests.
239  * This should eventually move to rpcib_state_t structure, since this
240  * will tell in which state the plugin is for a particular type of service
241  * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
242  * state for one and in no_accept state for the other.
 *
 * plugin_state is written under plugin_state_lock (see rpcib_detach(),
 * which sets it to NO_ACCEPT).  The lock is initialized in _init() and
 * destroyed in _fini().
243  */
244 int		plugin_state;
245 kmutex_t	plugin_state_lock;
246 
/* LDI identity for this module; set in _init(), released in _fini(). */
247 ldi_ident_t rpcib_li;
248 
249 /*
250  * RPCIB RDMATF operations
 *
 * Forward declarations.  Most of these are the entry points placed in the
 * rib_ops vector further down; the rest (the *_via_hca variants, the rbuf
 * and cache helpers, rib_disconnect, rib_ping_srv, rib_stop_services and
 * rib_close_channels) are helpers that do not appear in rib_ops.
 * rib_posted_rbufs() is an empty stub compiled only when
 * MEASURE_POOL_DEPTH is defined.
251  */
252 #if defined(MEASURE_POOL_DEPTH)
253 static void rib_posted_rbufs(uint32_t x) { return; }
254 #endif
255 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
256 static rdma_stat rib_disconnect(CONN *conn);
257 static void rib_listen(struct rdma_svc_data *rd);
258 static void rib_listen_stop(struct rdma_svc_data *rd);
259 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
260 	uint_t buflen, struct mrc *buf_handle);
261 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
262 	struct mrc buf_handle);
263 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
264 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
265 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
266 		struct mrc buf_handle);
267 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
268 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
269 	void *lrc);
270 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
271 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
272 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
273 	caddr_t buf, int len, int cpu);
274 
275 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
276 
277 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
278 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
279 
280 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
281 
282 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
283 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
284 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
285 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
286 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
287 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
288 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
289 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
290 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rib_hca_t **);
291 static rdma_stat rib_conn_get(struct netbuf *, int addr_type, void *, CONN **);
292 static rdma_stat rib_conn_release(CONN *conn);
293 static rdma_stat rib_getinfo(rdma_info_t *info);
294 
295 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
296 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
297 static void rib_destroy_cache(rib_hca_t *hca);
298 static	void	rib_server_side_cache_reclaim(void *argp);
299 static int avl_compare(const void *t1, const void *t2);
300 
301 static void rib_stop_services(rib_hca_t *);
302 static void rib_close_channels(rib_conn_list_t *);
303 
304 /*
305  * RPCIB addressing operations
306  */
307 
308 /*
309  * RDMA operations the RPCIB module exports
 *
 * The initializer is positional, so the order of the entries must match
 * the member order of rdmaops_t (declared outside this file).  The vector
 * is published through rib_mod below.
310  */
311 static rdmaops_t rib_ops = {
312 	rib_reachable,
313 	rib_conn_get,
314 	rib_conn_release,
315 	rib_listen,
316 	rib_listen_stop,
317 	rib_registermem,
318 	rib_deregistermem,
319 	rib_registermemsync,
320 	rib_deregistermemsync,
321 	rib_syncmem,
322 	rib_reg_buf_alloc,
323 	rib_reg_buf_free,
324 	rib_send,
325 	rib_send_resp,
326 	rib_post_resp,
327 	rib_post_resp_remove,
328 	rib_post_recv,
329 	rib_recv,
330 	rib_read,
331 	rib_write,
332 	rib_getinfo,
333 };
334 
335 /*
336  * RDMATF RPCIB plugin details
 *
 * Descriptor passed to rdma_register_mod() in rpcib_attach() and to
 * rdma_unregister_mod() in _fini().  rpcib_attach() stores the HCA count
 * in rib_mod.rdma_count before registering; presumably that is the member
 * initialized to 0 here (the initializer is positional - confirm against
 * the rdma_mod_t declaration).
337  */
338 static rdma_mod_t rib_mod = {
339 	"ibtf",		/* api name */
340 	RDMATF_VERS_1,
341 	0,
342 	&rib_ops,	/* rdma op vector for ibtf */
343 };
344 
/*
 * Forward declarations of internal helpers.  open_hcas() is called from
 * rpcib_attach() with open_hca_lock held.  The four rib_*_{s,r}cq_handler
 * routines are the completion handlers that open_hcas() passes to
 * rib_create_cq().  rib_async_handler() is the asynchronous event handler
 * named in rib_modinfo.
 */
345 static rdma_stat open_hcas(rpcib_state_t *);
346 static rdma_stat rib_qp_init(rib_qp_t *, int);
347 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
348 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
349 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
350 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
351 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
352 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
353 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
354 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
355 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
356 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, ibt_path_info_t *,
357 	ibt_ip_addr_t *, ibt_ip_addr_t *);
358 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
359 	rib_qp_t **);
360 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
361 	rib_qp_t **);
362 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
363 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
364 static int rib_free_sendwait(struct send_wid *);
365 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
366 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
367 static void rdma_done_rem_list(rib_qp_t *);
368 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
369 
370 static void rib_async_handler(void *,
371 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
372 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
373 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
374 static int rib_free_svc_recv(struct svc_recv *);
375 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
376 static void rib_free_wid(struct recv_wid *);
377 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
378 static void rib_detach_hca(rib_hca_t *);
379 static rdma_stat rib_chk_srv_ibaddr(struct netbuf *, int,
380 	ibt_path_info_t *, ibt_ip_addr_t *, ibt_ip_addr_t *);
381 
382 /*
383  * Registration with IBTF as a consumer
 *
 * Passed to ibt_attach() in rpcib_attach().  Registers rib_async_handler()
 * for asynchronous events, supplies no memory region handler, and
 * identifies this client as "nfs/ib".
384  */
385 static struct ibt_clnt_modinfo_s rib_modinfo = {
386 	IBTI_V2,
387 	IBT_GENERIC,
388 	rib_async_handler,	/* async event handler */
389 	NULL,			/* Memory Region Handler */
390 	"nfs/ib"
391 };
392 
393 /*
394  * Global structure
395  */
396 
/*
 * Driver-wide attach state.
 *   rpcib_dip   - devinfo node of the attached instance.  Set by
 *                 rpcib_attach(), cleared by rpcib_detach() and returned
 *                 by rpcib_getinfo() for DDI_INFO_DEVT2DEVINFO.
 *   rpcib_mutex - held while rpcib_dip is updated in attach and detach.
 *                 It is initialized in rpcib_attach() and destroyed in
 *                 rpcib_detach().
 */
397 typedef struct rpcib_s {
398 	dev_info_t	*rpcib_dip;
399 	kmutex_t	rpcib_mutex;
400 } rpcib_t;
401 
402 rpcib_t rpcib;
403 
404 /*
405  * /etc/system controlled variable to control
406  * debugging in rpcib kernel module.
407  * Set it to values greater than 1 to control
408  * the amount of debugging messages required.
 * Defaults to 0.
409  */
410 int rib_debug = 0;
411 
412 
413 int
414 _init(void)
415 {
416 	int		error;
417 	int ret;
418 
419 	error = mod_install((struct modlinkage *)&rib_modlinkage);
420 	if (error != 0) {
421 		/*
422 		 * Could not load module
423 		 */
424 		return (error);
425 	}
426 	ret = ldi_ident_from_mod(&rib_modlinkage, &rpcib_li);
427 	if (ret != 0)
428 		rpcib_li = NULL;
429 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
430 
431 	return (0);
432 }
433 
434 int
435 _fini()
436 {
437 	int status;
438 
439 	if ((status = rdma_unregister_mod(&rib_mod)) != RDMA_SUCCESS) {
440 		return (EBUSY);
441 	}
442 
443 	/*
444 	 * Remove module
445 	 */
446 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
447 		(void) rdma_register_mod(&rib_mod);
448 		return (status);
449 	}
450 	mutex_destroy(&plugin_state_lock);
451 	ldi_ident_release(rpcib_li);
452 	return (0);
453 }
454 
455 int
456 _info(struct modinfo *modinfop)
457 {
458 	return (mod_info(&rib_modlinkage, modinfop));
459 }
460 
461 
462 /*
463  * rpcib_getinfo()
464  * Given the device number, return the devinfo pointer or the
465  * instance number.
466  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
467  */
468 
469 /*ARGSUSED*/
470 static int
471 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
472 {
473 	int ret = DDI_SUCCESS;
474 
475 	switch (cmd) {
476 	case DDI_INFO_DEVT2DEVINFO:
477 		if (rpcib.rpcib_dip != NULL)
478 			*result = rpcib.rpcib_dip;
479 		else {
480 			*result = NULL;
481 			ret = DDI_FAILURE;
482 		}
483 		break;
484 
485 	case DDI_INFO_DEVT2INSTANCE:
486 		*result = NULL;
487 		break;
488 
489 	default:
490 		ret = DDI_FAILURE;
491 	}
492 	return (ret);
493 }
494 
495 static int
496 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
497 {
498 	ibt_status_t	ibt_status;
499 	rdma_stat	r_status;
500 
501 	switch (cmd) {
502 	case DDI_ATTACH:
503 		break;
504 	case DDI_RESUME:
505 		return (DDI_SUCCESS);
506 	default:
507 		return (DDI_FAILURE);
508 	}
509 
510 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
511 
512 	mutex_enter(&rpcib.rpcib_mutex);
513 	if (rpcib.rpcib_dip != NULL) {
514 		mutex_exit(&rpcib.rpcib_mutex);
515 		return (DDI_FAILURE);
516 	}
517 	rpcib.rpcib_dip = dip;
518 	mutex_exit(&rpcib.rpcib_mutex);
519 	/*
520 	 * Create the "rpcib" minor-node.
521 	 */
522 	if (ddi_create_minor_node(dip,
523 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
524 		/* Error message, no cmn_err as they print on console */
525 		return (DDI_FAILURE);
526 	}
527 
528 	if (rib_stat == NULL) {
529 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
530 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
531 	}
532 
533 	rib_stat->hca_count = ibt_get_hca_list(&rib_stat->hca_guids);
534 	if (rib_stat->hca_count < 1) {
535 		mutex_destroy(&rib_stat->open_hca_lock);
536 		kmem_free(rib_stat, sizeof (*rib_stat));
537 		rib_stat = NULL;
538 		return (DDI_FAILURE);
539 	}
540 
541 	ibt_status = ibt_attach(&rib_modinfo, dip,
542 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
543 
544 	if (ibt_status != IBT_SUCCESS) {
545 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
546 		mutex_destroy(&rib_stat->open_hca_lock);
547 		kmem_free(rib_stat, sizeof (*rib_stat));
548 		rib_stat = NULL;
549 		return (DDI_FAILURE);
550 	}
551 
552 	mutex_enter(&rib_stat->open_hca_lock);
553 	if (open_hcas(rib_stat) != RDMA_SUCCESS) {
554 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
555 		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
556 		mutex_exit(&rib_stat->open_hca_lock);
557 		mutex_destroy(&rib_stat->open_hca_lock);
558 		kmem_free(rib_stat, sizeof (*rib_stat));
559 		rib_stat = NULL;
560 		return (DDI_FAILURE);
561 	}
562 	mutex_exit(&rib_stat->open_hca_lock);
563 
564 	/*
565 	 * Register with rdmatf
566 	 */
567 	rib_mod.rdma_count = rib_stat->hca_count;
568 	r_status = rdma_register_mod(&rib_mod);
569 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
570 		rib_detach_hca(rib_stat->hca);
571 		ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
572 		(void) ibt_detach(rib_stat->ibt_clnt_hdl);
573 		mutex_destroy(&rib_stat->open_hca_lock);
574 		kmem_free(rib_stat, sizeof (*rib_stat));
575 		rib_stat = NULL;
576 		return (DDI_FAILURE);
577 	}
578 
579 
580 	return (DDI_SUCCESS);
581 }
582 
583 /*ARGSUSED*/
584 static int
585 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
586 {
587 	switch (cmd) {
588 
589 	case DDI_DETACH:
590 		break;
591 
592 	case DDI_SUSPEND:
593 	default:
594 		return (DDI_FAILURE);
595 	}
596 
597 	/*
598 	 * Detach the hca and free resources
599 	 */
600 	mutex_enter(&plugin_state_lock);
601 	plugin_state = NO_ACCEPT;
602 	mutex_exit(&plugin_state_lock);
603 	rib_detach_hca(rib_stat->hca);
604 	ibt_free_hca_list(rib_stat->hca_guids, rib_stat->hca_count);
605 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
606 
607 	mutex_enter(&rpcib.rpcib_mutex);
608 	rpcib.rpcib_dip = NULL;
609 	mutex_exit(&rpcib.rpcib_mutex);
610 
611 	mutex_destroy(&rpcib.rpcib_mutex);
612 	return (DDI_SUCCESS);
613 }
614 
615 
/*
 * Forward declarations for buffer pool, reply list and connection list
 * helpers.  Of these, only rib_rbufpool_destroy() is used in the code
 * above this point of the file's remainder that is visible here: it is
 * called by open_hcas() to undo the receive pool when the send pool cannot
 * be created.
 */
616 static void rib_rbufpool_free(rib_hca_t *, int);
617 static void rib_rbufpool_deregister(rib_hca_t *, int);
618 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
619 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
620 static rdma_stat rib_rem_replylist(rib_qp_t *);
621 static int rib_remreply(rib_qp_t *, struct reply *);
622 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
623 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
624 
625 
626 /*
627  * One CQ pair per HCA
628  */
629 static rdma_stat
630 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
631 	rib_cq_t **cqp, rpcib_state_t *ribstat)
632 {
633 	rib_cq_t	*cq;
634 	ibt_cq_attr_t	cq_attr;
635 	uint32_t	real_size;
636 	ibt_status_t	status;
637 	rdma_stat	error = RDMA_SUCCESS;
638 
639 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
640 	cq->rib_hca = hca;
641 	cq_attr.cq_size = cq_size;
642 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
643 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
644 	    &real_size);
645 	if (status != IBT_SUCCESS) {
646 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
647 		    " status=%d", status);
648 		error = RDMA_FAILED;
649 		goto fail;
650 	}
651 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, ribstat);
652 
653 	/*
654 	 * Enable CQ callbacks. CQ Callbacks are single shot
655 	 * (e.g. you have to call ibt_enable_cq_notify()
656 	 * after each callback to get another one).
657 	 */
658 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
659 	if (status != IBT_SUCCESS) {
660 		cmn_err(CE_WARN, "rib_create_cq: "
661 		    "enable_cq_notify failed, status %d", status);
662 		error = RDMA_FAILED;
663 		goto fail;
664 	}
665 	*cqp = cq;
666 
667 	return (error);
668 fail:
669 	if (cq->rib_cq_hdl)
670 		(void) ibt_free_cq(cq->rib_cq_hdl);
671 	if (cq)
672 		kmem_free(cq, sizeof (rib_cq_t));
673 	return (error);
674 }
675 
676 static rdma_stat
677 open_hcas(rpcib_state_t *ribstat)
678 {
679 	rib_hca_t		*hca;
680 	ibt_status_t		ibt_status;
681 	rdma_stat		status;
682 	ibt_hca_portinfo_t	*pinfop;
683 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
684 	uint_t			size, cq_size;
685 	int			i;
686 	kstat_t *ksp;
687 	cache_avl_struct_t example_avl_node;
688 	char rssc_name[32];
689 
690 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
691 
692 	if (ribstat->hcas == NULL)
693 		ribstat->hcas = kmem_zalloc(ribstat->hca_count *
694 		    sizeof (rib_hca_t), KM_SLEEP);
695 
696 	/*
697 	 * Open a hca and setup for RDMA
698 	 */
699 	for (i = 0; i < ribstat->hca_count; i++) {
700 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
701 		    ribstat->hca_guids[i],
702 		    &ribstat->hcas[i].hca_hdl);
703 		if (ibt_status != IBT_SUCCESS) {
704 			continue;
705 		}
706 		ribstat->hcas[i].hca_guid = ribstat->hca_guids[i];
707 		hca = &(ribstat->hcas[i]);
708 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
709 		hca->state = HCA_INITED;
710 
711 		/*
712 		 * query HCA info
713 		 */
714 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
715 		if (ibt_status != IBT_SUCCESS) {
716 			goto fail1;
717 		}
718 
719 		/*
720 		 * One PD (Protection Domain) per HCA.
721 		 * A qp is allowed to access a memory region
722 		 * only when it's in the same PD as that of
723 		 * the memory region.
724 		 */
725 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
726 		if (ibt_status != IBT_SUCCESS) {
727 			goto fail1;
728 		}
729 
730 		/*
731 		 * query HCA ports
732 		 */
733 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
734 		    0, &pinfop, &hca->hca_nports, &size);
735 		if (ibt_status != IBT_SUCCESS) {
736 			goto fail2;
737 		}
738 		hca->hca_ports = pinfop;
739 		hca->hca_pinfosz = size;
740 		pinfop = NULL;
741 
742 		cq_size = DEF_CQ_SIZE; /* default cq size */
743 		/*
744 		 * Create 2 pairs of cq's (1 pair for client
745 		 * and the other pair for server) on this hca.
746 		 * If number of qp's gets too large, then several
747 		 * cq's will be needed.
748 		 */
749 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
750 		    &hca->svc_rcq, ribstat);
751 		if (status != RDMA_SUCCESS) {
752 			goto fail3;
753 		}
754 
755 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
756 		    &hca->svc_scq, ribstat);
757 		if (status != RDMA_SUCCESS) {
758 			goto fail3;
759 		}
760 
761 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
762 		    &hca->clnt_rcq, ribstat);
763 		if (status != RDMA_SUCCESS) {
764 			goto fail3;
765 		}
766 
767 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
768 		    &hca->clnt_scq, ribstat);
769 		if (status != RDMA_SUCCESS) {
770 			goto fail3;
771 		}
772 
773 		/*
774 		 * Create buffer pools.
775 		 * Note rib_rbuf_create also allocates memory windows.
776 		 */
777 		hca->recv_pool = rib_rbufpool_create(hca,
778 		    RECV_BUFFER, MAX_BUFS);
779 		if (hca->recv_pool == NULL) {
780 			goto fail3;
781 		}
782 
783 		hca->send_pool = rib_rbufpool_create(hca,
784 		    SEND_BUFFER, MAX_BUFS);
785 		if (hca->send_pool == NULL) {
786 			rib_rbufpool_destroy(hca, RECV_BUFFER);
787 			goto fail3;
788 		}
789 
790 		if (hca->server_side_cache == NULL) {
791 			(void) sprintf(rssc_name,
792 			    "rib_server_side_cache_%04d", i);
793 			hca->server_side_cache = kmem_cache_create(
794 			    rssc_name,
795 			    sizeof (cache_avl_struct_t), 0,
796 			    NULL,
797 			    NULL,
798 			    rib_server_side_cache_reclaim,
799 			    hca, NULL, 0);
800 		}
801 
802 		avl_create(&hca->avl_tree,
803 		    avl_compare,
804 		    sizeof (cache_avl_struct_t),
805 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
806 		    (uint_t)(uintptr_t)&example_avl_node);
807 
808 		rw_init(&hca->avl_rw_lock,
809 		    NULL, RW_DRIVER, hca->iblock);
810 		mutex_init(&hca->cache_allocation,
811 		    NULL, MUTEX_DRIVER, NULL);
812 		hca->avl_init = TRUE;
813 
814 		/* Create kstats for the cache */
815 		ASSERT(INGLOBALZONE(curproc));
816 
817 		if (!stats_enabled) {
818 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
819 			    KSTAT_TYPE_NAMED,
820 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
821 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
822 			    GLOBAL_ZONEID);
823 			if (ksp) {
824 				ksp->ks_data = (void *) &rpcib_kstat;
825 				ksp->ks_update = rpcib_cache_kstat_update;
826 				kstat_install(ksp);
827 				stats_enabled = TRUE;
828 			}
829 		}
830 		if (NULL == hca->reg_cache_clean_up) {
831 			hca->reg_cache_clean_up = ddi_taskq_create(NULL,
832 			    "REG_CACHE_CLEANUP", 1, TASKQ_DEFAULTPRI, 0);
833 		}
834 
835 		/*
836 		 * Initialize the registered service list and
837 		 * the lock
838 		 */
839 		hca->service_list = NULL;
840 		rw_init(&hca->service_list_lock, NULL, RW_DRIVER, hca->iblock);
841 
842 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
843 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
844 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
845 		    hca->iblock);
846 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
847 		    hca->iblock);
848 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
849 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
850 		hca->inuse = TRUE;
851 		/*
852 		 * XXX One hca only. Add multi-hca functionality if needed
853 		 * later.
854 		 */
855 		ribstat->hca = hca;
856 		ribstat->nhca_inited++;
857 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
858 		break;
859 
860 fail3:
861 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
862 fail2:
863 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
864 fail1:
865 		(void) ibt_close_hca(hca->hca_hdl);
866 
867 	}
868 	if (ribstat->hca != NULL)
869 		return (RDMA_SUCCESS);
870 	else
871 		return (RDMA_FAILED);
872 }
873 
874 /*
875  * Callback routines
876  */
877 
878 /*
879  * SCQ handlers
880  */
881 /* ARGSUSED */
882 static void
883 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
884 {
885 	ibt_status_t	ibt_status;
886 	ibt_wc_t	wc;
887 	int		i;
888 
889 	/*
890 	 * Re-enable cq notify here to avoid missing any
891 	 * completion queue notification.
892 	 */
893 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
894 
895 	ibt_status = IBT_SUCCESS;
896 	while (ibt_status != IBT_CQ_EMPTY) {
897 	bzero(&wc, sizeof (wc));
898 	ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
899 	if (ibt_status != IBT_SUCCESS)
900 		return;
901 
902 	/*
903 	 * Got a send completion
904 	 */
905 	if (wc.wc_id != NULL) {	/* XXX can it be otherwise ???? */
906 		struct send_wid *wd = (struct send_wid *)(uintptr_t)wc.wc_id;
907 		CONN	*conn = qptoc(wd->qp);
908 
909 		mutex_enter(&wd->sendwait_lock);
910 		switch (wc.wc_status) {
911 		case IBT_WC_SUCCESS:
912 			wd->status = RDMA_SUCCESS;
913 			break;
914 		case IBT_WC_WR_FLUSHED_ERR:
915 			wd->status = RDMA_FAILED;
916 			break;
917 		default:
918 /*
919  *    RC Send Q Error Code		Local state     Remote State
920  *    ==================== 		===========     ============
921  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
922  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
923  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
924  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
925  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
926  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
927  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
928  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
929  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
930  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
931  *    IBT_WC_WR_FLUSHED_ERR               None            None
932  */
933 			/*
934 			 * Channel in error state. Set connection to
935 			 * ERROR and cleanup will happen either from
936 			 * conn_release  or from rib_conn_get
937 			 */
938 			wd->status = RDMA_FAILED;
939 			mutex_enter(&conn->c_lock);
940 			if (conn->c_state != C_DISCONN_PEND)
941 				conn->c_state = C_ERROR_CONN;
942 			mutex_exit(&conn->c_lock);
943 			break;
944 		}
945 
946 		if (wd->cv_sig == 1) {
947 			/*
948 			 * Notify poster
949 			 */
950 			cv_signal(&wd->wait_cv);
951 			mutex_exit(&wd->sendwait_lock);
952 		} else {
953 			/*
954 			 * Poster not waiting for notification.
955 			 * Free the send buffers and send_wid
956 			 */
957 			for (i = 0; i < wd->nsbufs; i++) {
958 				rib_rbuf_free(qptoc(wd->qp), SEND_BUFFER,
959 				    (void *)(uintptr_t)wd->sbufaddr[i]);
960 				}
961 			mutex_exit(&wd->sendwait_lock);
962 			(void) rib_free_sendwait(wd);
963 			}
964 		}
965 	}
966 }
967 
968 /* ARGSUSED */
969 static void
970 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
971 {
972 	ibt_status_t	ibt_status;
973 	ibt_wc_t	wc;
974 	int		i;
975 
976 	/*
977 	 * Re-enable cq notify here to avoid missing any
978 	 * completion queue notification.
979 	 */
980 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
981 
982 	ibt_status = IBT_SUCCESS;
983 	while (ibt_status != IBT_CQ_EMPTY) {
984 		bzero(&wc, sizeof (wc));
985 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
986 		if (ibt_status != IBT_SUCCESS)
987 			return;
988 
989 		/*
990 		 * Got a send completion
991 		 */
992 		if (wc.wc_id != NULL) { /* XXX NULL possible ???? */
993 			struct send_wid *wd =
994 			    (struct send_wid *)(uintptr_t)wc.wc_id;
995 			mutex_enter(&wd->sendwait_lock);
996 			if (wd->cv_sig == 1) {
997 				/*
998 				 * Update completion status and notify poster
999 				 */
1000 				if (wc.wc_status == IBT_WC_SUCCESS)
1001 					wd->status = RDMA_SUCCESS;
1002 				else
1003 					wd->status = RDMA_FAILED;
1004 				cv_signal(&wd->wait_cv);
1005 				mutex_exit(&wd->sendwait_lock);
1006 			} else {
1007 				/*
1008 				 * Poster not waiting for notification.
1009 				 * Free the send buffers and send_wid
1010 				 */
1011 				for (i = 0; i < wd->nsbufs; i++) {
1012 					rib_rbuf_free(qptoc(wd->qp),
1013 					    SEND_BUFFER,
1014 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1015 				}
1016 				mutex_exit(&wd->sendwait_lock);
1017 				(void) rib_free_sendwait(wd);
1018 			}
1019 		}
1020 	}
1021 }
1022 
1023 /*
1024  * RCQ handler
1025  */
1026 /* ARGSUSED */
1027 static void
1028 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1029 {
1030 	rib_qp_t	*qp;
1031 	ibt_status_t	ibt_status;
1032 	ibt_wc_t	wc;
1033 	struct recv_wid	*rwid;
1034 
1035 	/*
1036 	 * Re-enable cq notify here to avoid missing any
1037 	 * completion queue notification.
1038 	 */
1039 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1040 
1041 	ibt_status = IBT_SUCCESS;
1042 	while (ibt_status != IBT_CQ_EMPTY) {
1043 		bzero(&wc, sizeof (wc));
1044 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1045 		if (ibt_status != IBT_SUCCESS)
1046 			return;
1047 
1048 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1049 		qp = rwid->qp;
1050 		if (wc.wc_status == IBT_WC_SUCCESS) {
1051 			XDR	inxdrs, *xdrs;
1052 			uint_t	xid, vers, op, find_xid = 0;
1053 			struct reply	*r;
1054 			CONN *conn = qptoc(qp);
1055 			uint32_t rdma_credit = 0;
1056 
1057 			xdrs = &inxdrs;
1058 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1059 			    wc.wc_bytes_xfer, XDR_DECODE);
1060 			/*
1061 			 * Treat xid as opaque (xid is the first entity
1062 			 * in the rpc rdma message).
1063 			 */
1064 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1065 
1066 			/* Skip xid and set the xdr position accordingly. */
1067 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1068 			(void) xdr_u_int(xdrs, &vers);
1069 			(void) xdr_u_int(xdrs, &rdma_credit);
1070 			(void) xdr_u_int(xdrs, &op);
1071 			XDR_DESTROY(xdrs);
1072 
1073 			if (vers != RPCRDMA_VERS) {
1074 				/*
1075 				 * Invalid RPC/RDMA version. Cannot
1076 				 * interoperate.  Set connection to
1077 				 * ERROR state and bail out.
1078 				 */
1079 				mutex_enter(&conn->c_lock);
1080 				if (conn->c_state != C_DISCONN_PEND)
1081 					conn->c_state = C_ERROR_CONN;
1082 				mutex_exit(&conn->c_lock);
1083 				rib_rbuf_free(conn, RECV_BUFFER,
1084 				    (void *)(uintptr_t)rwid->addr);
1085 				rib_free_wid(rwid);
1086 				continue;
1087 			}
1088 
1089 			mutex_enter(&qp->replylist_lock);
1090 			for (r = qp->replylist; r != NULL; r = r->next) {
1091 				if (r->xid == xid) {
1092 					find_xid = 1;
1093 					switch (op) {
1094 					case RDMA_MSG:
1095 					case RDMA_NOMSG:
1096 					case RDMA_MSGP:
1097 						r->status = RDMA_SUCCESS;
1098 						r->vaddr_cq = rwid->addr;
1099 						r->bytes_xfer =
1100 						    wc.wc_bytes_xfer;
1101 						cv_signal(&r->wait_cv);
1102 						break;
1103 					default:
1104 						rib_rbuf_free(qptoc(qp),
1105 						    RECV_BUFFER,
1106 						    (void *)(uintptr_t)
1107 						    rwid->addr);
1108 						break;
1109 					}
1110 					break;
1111 				}
1112 			}
1113 			mutex_exit(&qp->replylist_lock);
1114 			if (find_xid == 0) {
1115 				/* RPC caller not waiting for reply */
1116 
1117 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1118 				    int, xid);
1119 
1120 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1121 				    (void *)(uintptr_t)rwid->addr);
1122 			}
1123 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1124 			CONN *conn = qptoc(qp);
1125 
1126 			/*
1127 			 * Connection being flushed. Just free
1128 			 * the posted buffer
1129 			 */
1130 			rib_rbuf_free(conn, RECV_BUFFER,
1131 			    (void *)(uintptr_t)rwid->addr);
1132 		} else {
1133 			CONN *conn = qptoc(qp);
1134 /*
1135  *  RC Recv Q Error Code		Local state     Remote State
1136  *  ====================		===========     ============
1137  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1138  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1139  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1140  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1141  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1142  *  IBT_WC_WR_FLUSHED_ERR               None            None
1143  */
1144 			/*
1145 			 * Channel in error state. Set connection
1146 			 * in ERROR state.
1147 			 */
1148 			mutex_enter(&conn->c_lock);
1149 			if (conn->c_state != C_DISCONN_PEND)
1150 				conn->c_state = C_ERROR_CONN;
1151 			mutex_exit(&conn->c_lock);
1152 			rib_rbuf_free(conn, RECV_BUFFER,
1153 			    (void *)(uintptr_t)rwid->addr);
1154 		}
1155 		rib_free_wid(rwid);
1156 	}
1157 }
1158 
1159 /* Server side */
1160 /* ARGSUSED */
1161 static void
1162 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1163 {
1164 	rdma_recv_data_t *rdp;
1165 	rib_qp_t	*qp;
1166 	ibt_status_t	ibt_status;
1167 	ibt_wc_t	wc;
1168 	struct svc_recv	*s_recvp;
1169 	CONN		*conn;
1170 	mblk_t		*mp;
1171 
1172 	/*
1173 	 * Re-enable cq notify here to avoid missing any
1174 	 * completion queue notification.
1175 	 */
1176 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1177 
1178 	ibt_status = IBT_SUCCESS;
1179 	while (ibt_status != IBT_CQ_EMPTY) {
1180 		bzero(&wc, sizeof (wc));
1181 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1182 		if (ibt_status != IBT_SUCCESS)
1183 			return;
1184 
1185 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1186 		qp = s_recvp->qp;
1187 		conn = qptoc(qp);
1188 		mutex_enter(&qp->posted_rbufs_lock);
1189 		qp->n_posted_rbufs--;
1190 #if defined(MEASURE_POOL_DEPTH)
1191 		rib_posted_rbufs(preposted_rbufs -  qp->n_posted_rbufs);
1192 #endif
1193 		if (qp->n_posted_rbufs == 0)
1194 			cv_signal(&qp->posted_rbufs_cv);
1195 		mutex_exit(&qp->posted_rbufs_lock);
1196 
1197 		if (wc.wc_status == IBT_WC_SUCCESS) {
1198 			XDR	inxdrs, *xdrs;
1199 			uint_t	xid, vers, op;
1200 			uint32_t rdma_credit;
1201 
1202 			xdrs = &inxdrs;
1203 			/* s_recvp->vaddr stores data */
1204 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1205 			    wc.wc_bytes_xfer, XDR_DECODE);
1206 
1207 			/*
1208 			 * Treat xid as opaque (xid is the first entity
1209 			 * in the rpc rdma message).
1210 			 */
1211 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1212 			/* Skip xid and set the xdr position accordingly. */
1213 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1214 			if (!xdr_u_int(xdrs, &vers) ||
1215 			    !xdr_u_int(xdrs, &rdma_credit) ||
1216 			    !xdr_u_int(xdrs, &op)) {
1217 				rib_rbuf_free(conn, RECV_BUFFER,
1218 				    (void *)(uintptr_t)s_recvp->vaddr);
1219 				XDR_DESTROY(xdrs);
1220 				(void) rib_free_svc_recv(s_recvp);
1221 				continue;
1222 			}
1223 			XDR_DESTROY(xdrs);
1224 
1225 			if (vers != RPCRDMA_VERS) {
1226 				/*
1227 				 * Invalid RPC/RDMA version.
1228 				 * Drop rpc rdma message.
1229 				 */
1230 				rib_rbuf_free(conn, RECV_BUFFER,
1231 				    (void *)(uintptr_t)s_recvp->vaddr);
1232 				(void) rib_free_svc_recv(s_recvp);
1233 				continue;
1234 			}
1235 			/*
1236 			 * Is this for RDMA_DONE?
1237 			 */
1238 			if (op == RDMA_DONE) {
1239 				rib_rbuf_free(conn, RECV_BUFFER,
1240 				    (void *)(uintptr_t)s_recvp->vaddr);
1241 				/*
1242 				 * Wake up the thread waiting on
1243 				 * a RDMA_DONE for xid
1244 				 */
1245 				mutex_enter(&qp->rdlist_lock);
1246 				rdma_done_notify(qp, xid);
1247 				mutex_exit(&qp->rdlist_lock);
1248 				(void) rib_free_svc_recv(s_recvp);
1249 				continue;
1250 			}
1251 
1252 			mutex_enter(&plugin_state_lock);
1253 			if (plugin_state == ACCEPT) {
1254 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1255 				    == NULL)
1256 					(void) strwaitbuf(
1257 					    sizeof (*rdp), BPRI_LO);
1258 				/*
1259 				 * Plugin is in accept state, hence the master
1260 				 * transport queue for this is still accepting
1261 				 * requests. Hence we can call svc_queuereq to
1262 				 * queue this recieved msg.
1263 				 */
1264 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1265 				rdp->conn = conn;
1266 				rdp->rpcmsg.addr =
1267 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1268 				rdp->rpcmsg.type = RECV_BUFFER;
1269 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1270 				rdp->status = wc.wc_status;
1271 				mutex_enter(&conn->c_lock);
1272 				conn->c_ref++;
1273 				mutex_exit(&conn->c_lock);
1274 				mp->b_wptr += sizeof (*rdp);
1275 				svc_queuereq((queue_t *)rib_stat->q, mp);
1276 				mutex_exit(&plugin_state_lock);
1277 			} else {
1278 				/*
1279 				 * The master transport for this is going
1280 				 * away and the queue is not accepting anymore
1281 				 * requests for krpc, so don't do anything, just
1282 				 * free the msg.
1283 				 */
1284 				mutex_exit(&plugin_state_lock);
1285 				rib_rbuf_free(conn, RECV_BUFFER,
1286 				    (void *)(uintptr_t)s_recvp->vaddr);
1287 			}
1288 		} else {
1289 			rib_rbuf_free(conn, RECV_BUFFER,
1290 			    (void *)(uintptr_t)s_recvp->vaddr);
1291 		}
1292 		(void) rib_free_svc_recv(s_recvp);
1293 	}
1294 }
1295 
1296 /*
1297  * Handles DR event of IBT_HCA_DETACH_EVENT.
1298  */
1299 /* ARGSUSED */
1300 static void
1301 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1302 	ibt_async_code_t code, ibt_async_event_t *event)
1303 {
1304 
1305 	switch (code) {
1306 	case IBT_HCA_ATTACH_EVENT:
1307 		/* ignore */
1308 		break;
1309 	case IBT_HCA_DETACH_EVENT:
1310 	{
1311 		ASSERT(rib_stat->hca->hca_hdl == hca_hdl);
1312 		rib_detach_hca(rib_stat->hca);
1313 #ifdef DEBUG
1314 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1315 #endif
1316 		break;
1317 	}
1318 #ifdef DEBUG
1319 	case IBT_EVENT_PATH_MIGRATED:
1320 		cmn_err(CE_NOTE, "rib_async_handler(): "
1321 		    "IBT_EVENT_PATH_MIGRATED\n");
1322 		break;
1323 	case IBT_EVENT_SQD:
1324 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1325 		break;
1326 	case IBT_EVENT_COM_EST:
1327 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1328 		break;
1329 	case IBT_ERROR_CATASTROPHIC_CHAN:
1330 		cmn_err(CE_NOTE, "rib_async_handler(): "
1331 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1332 		break;
1333 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1334 		cmn_err(CE_NOTE, "rib_async_handler(): "
1335 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1336 		break;
1337 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1338 		cmn_err(CE_NOTE, "rib_async_handler(): "
1339 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1340 		break;
1341 	case IBT_ERROR_PATH_MIGRATE_REQ:
1342 		cmn_err(CE_NOTE, "rib_async_handler(): "
1343 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1344 		break;
1345 	case IBT_ERROR_CQ:
1346 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1347 		break;
1348 	case IBT_ERROR_PORT_DOWN:
1349 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1350 		break;
1351 	case IBT_EVENT_PORT_UP:
1352 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1353 		break;
1354 	case IBT_ASYNC_OPAQUE1:
1355 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1356 		break;
1357 	case IBT_ASYNC_OPAQUE2:
1358 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1359 		break;
1360 	case IBT_ASYNC_OPAQUE3:
1361 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1362 		break;
1363 	case IBT_ASYNC_OPAQUE4:
1364 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1365 		break;
1366 #endif
1367 	default:
1368 		break;
1369 	}
1370 }
1371 
1372 /*
1373  * Client's reachable function.
1374  */
1375 static rdma_stat
1376 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1377 {
1378 	rib_hca_t	*hca;
1379 	rdma_stat	status;
1380 
1381 	/*
1382 	 * First check if a hca is still attached
1383 	 */
1384 	*handle = NULL;
1385 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
1386 	if (rib_stat->hca->state != HCA_INITED) {
1387 		rw_exit(&rib_stat->hca->state_lock);
1388 		return (RDMA_FAILED);
1389 	}
1390 	status = rib_ping_srv(addr_type, raddr, &hca);
1391 	rw_exit(&rib_stat->hca->state_lock);
1392 
1393 	if (status == RDMA_SUCCESS) {
1394 		*handle = (void *)hca;
1395 		return (RDMA_SUCCESS);
1396 	} else {
1397 		*handle = NULL;
1398 		DTRACE_PROBE(rpcib__i__pingfailed);
1399 		return (RDMA_FAILED);
1400 	}
1401 }
1402 
1403 /* Client side qp creation */
1404 static rdma_stat
1405 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1406 {
1407 	rib_qp_t	*kqp = NULL;
1408 	CONN		*conn;
1409 	rdma_clnt_cred_ctrl_t *cc_info;
1410 
1411 	ASSERT(qp != NULL);
1412 	*qp = NULL;
1413 
1414 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1415 	conn = qptoc(kqp);
1416 	kqp->hca = hca;
1417 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1418 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1419 
1420 	kqp->mode = RIB_CLIENT;
1421 	kqp->chan_flags = IBT_BLOCKING;
1422 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1423 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1424 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1425 	/*
1426 	 * Initialize
1427 	 */
1428 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1429 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1430 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1431 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1432 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1433 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1434 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1435 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1436 	/*
1437 	 * Initialize the client credit control
1438 	 * portion of the rdmaconn struct.
1439 	 */
1440 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1441 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1442 	cc_info->clnt_cc_granted_ops = 0;
1443 	cc_info->clnt_cc_in_flight_ops = 0;
1444 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1445 
1446 	*qp = kqp;
1447 	return (RDMA_SUCCESS);
1448 }
1449 
1450 /* Server side qp creation */
1451 static rdma_stat
1452 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1453 {
1454 	rib_qp_t	*kqp = NULL;
1455 	ibt_chan_sizes_t	chan_sizes;
1456 	ibt_rc_chan_alloc_args_t	qp_attr;
1457 	ibt_status_t		ibt_status;
1458 	rdma_srv_cred_ctrl_t *cc_info;
1459 
1460 	*qp = NULL;
1461 
1462 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1463 	kqp->hca = hca;
1464 	kqp->port_num = port;
1465 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1466 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1467 
1468 	/*
1469 	 * Create the qp handle
1470 	 */
1471 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1472 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1473 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1474 	qp_attr.rc_pd = hca->pd_hdl;
1475 	qp_attr.rc_hca_port_num = port;
1476 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1477 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1478 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1479 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1480 	qp_attr.rc_clone_chan = NULL;
1481 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1482 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1483 
1484 	rw_enter(&hca->state_lock, RW_READER);
1485 	if (hca->state != HCA_DETACHED) {
1486 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1487 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1488 		    &chan_sizes);
1489 	} else {
1490 		rw_exit(&hca->state_lock);
1491 		goto fail;
1492 	}
1493 	rw_exit(&hca->state_lock);
1494 
1495 	if (ibt_status != IBT_SUCCESS) {
1496 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1497 		    int, ibt_status);
1498 		goto fail;
1499 	}
1500 
1501 	kqp->mode = RIB_SERVER;
1502 	kqp->chan_flags = IBT_BLOCKING;
1503 	kqp->q = q;	/* server ONLY */
1504 
1505 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1506 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1507 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1508 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1509 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1510 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1511 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1512 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1513 	/*
1514 	 * Set the private data area to qp to be used in callbacks
1515 	 */
1516 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1517 	kqp->rdmaconn.c_state = C_CONNECTED;
1518 
1519 	/*
1520 	 * Initialize the server credit control
1521 	 * portion of the rdmaconn struct.
1522 	 */
1523 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1524 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1525 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1526 	cc_info->srv_cc_cur_buffers_used = 0;
1527 	cc_info->srv_cc_posted = preposted_rbufs;
1528 
1529 	*qp = kqp;
1530 
1531 	return (RDMA_SUCCESS);
1532 fail:
1533 	if (kqp)
1534 		kmem_free(kqp, sizeof (rib_qp_t));
1535 
1536 	return (RDMA_FAILED);
1537 }
1538 
1539 /* ARGSUSED */
1540 ibt_cm_status_t
1541 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1542     ibt_cm_return_args_t *ret_args, void *priv_data,
1543     ibt_priv_data_len_t len)
1544 {
1545 	rpcib_state_t   *ribstat;
1546 	rib_hca_t	*hca;
1547 
1548 	ribstat = (rpcib_state_t *)clnt_hdl;
1549 	hca = (rib_hca_t *)ribstat->hca;
1550 
1551 	switch (event->cm_type) {
1552 
1553 	/* got a connection close event */
1554 	case IBT_CM_EVENT_CONN_CLOSED:
1555 	{
1556 		CONN	*conn;
1557 		rib_qp_t *qp;
1558 
1559 		/* check reason why connection was closed */
1560 		switch (event->cm_event.closed) {
1561 		case IBT_CM_CLOSED_DREP_RCVD:
1562 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1563 		case IBT_CM_CLOSED_DUP:
1564 		case IBT_CM_CLOSED_ABORT:
1565 		case IBT_CM_CLOSED_ALREADY:
1566 			/*
1567 			 * These cases indicate the local end initiated
1568 			 * the closing of the channel. Nothing to do here.
1569 			 */
1570 			break;
1571 		default:
1572 			/*
1573 			 * Reason for CONN_CLOSED event must be one of
1574 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1575 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1576 			 * the remote end is closing the channel. In these
1577 			 * cases free the channel and transition to error
1578 			 * state
1579 			 */
1580 			qp = ibt_get_chan_private(event->cm_channel);
1581 			conn = qptoc(qp);
1582 			mutex_enter(&conn->c_lock);
1583 			if (conn->c_state == C_DISCONN_PEND) {
1584 				mutex_exit(&conn->c_lock);
1585 				break;
1586 			}
1587 
1588 			conn->c_state = C_ERROR_CONN;
1589 
1590 			/*
1591 			 * Free the rc_channel. Channel has already
1592 			 * transitioned to ERROR state and WRs have been
1593 			 * FLUSHED_ERR already.
1594 			 */
1595 			(void) ibt_free_channel(qp->qp_hdl);
1596 			qp->qp_hdl = NULL;
1597 
1598 			/*
1599 			 * Free the conn if c_ref is down to 0 already
1600 			 */
1601 			if (conn->c_ref == 0) {
1602 				/*
1603 				 * Remove from list and free conn
1604 				 */
1605 				conn->c_state = C_DISCONN_PEND;
1606 				mutex_exit(&conn->c_lock);
1607 				(void) rib_disconnect_channel(conn,
1608 				    &hca->cl_conn_list);
1609 			} else {
1610 				mutex_exit(&conn->c_lock);
1611 			}
1612 #ifdef DEBUG
1613 			if (rib_debug)
1614 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1615 				    "(CONN_CLOSED) channel disconnected");
1616 #endif
1617 			break;
1618 		}
1619 		break;
1620 	}
1621 	default:
1622 		break;
1623 	}
1624 	return (IBT_CM_ACCEPT);
1625 }
1626 
1627 /* Check server ib address */
1628 rdma_stat
1629 rib_chk_srv_ibaddr(struct netbuf *raddr,
1630 	int addr_type, ibt_path_info_t *path, ibt_ip_addr_t *s_ip,
1631 	ibt_ip_addr_t *d_ip)
1632 {
1633 	struct sockaddr_in	*sin4;
1634 	struct sockaddr_in6	*sin6;
1635 	ibt_status_t		ibt_status;
1636 	ibt_ip_path_attr_t	ipattr;
1637 	uint8_t npaths = 0;
1638 	ibt_path_ip_src_t	srcip;
1639 
1640 	ASSERT(raddr->buf != NULL);
1641 
1642 	(void) bzero(path, sizeof (ibt_path_info_t));
1643 
1644 	switch (addr_type) {
1645 	case AF_INET:
1646 		sin4 = (struct sockaddr_in *)raddr->buf;
1647 		d_ip->family = AF_INET;
1648 		d_ip->un.ip4addr = htonl(sin4->sin_addr.s_addr);
1649 		break;
1650 
1651 	case AF_INET6:
1652 		sin6 = (struct sockaddr_in6 *)raddr->buf;
1653 		d_ip->family = AF_INET6;
1654 		d_ip->un.ip6addr = sin6->sin6_addr;
1655 		break;
1656 
1657 	default:
1658 		return (RDMA_INVAL);
1659 	}
1660 
1661 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1662 	bzero(&srcip, sizeof (ibt_path_ip_src_t));
1663 
1664 	ipattr.ipa_dst_ip 	= d_ip;
1665 	ipattr.ipa_hca_guid 	= rib_stat->hca->hca_guid;
1666 	ipattr.ipa_ndst		= 1;
1667 	ipattr.ipa_max_paths	= 1;
1668 	npaths = 0;
1669 
1670 	ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1671 	    IBT_PATH_NO_FLAGS,
1672 	    &ipattr,
1673 	    path,
1674 	    &npaths,
1675 	    &srcip);
1676 
1677 	if (ibt_status != IBT_SUCCESS ||
1678 	    npaths < 1 ||
1679 	    path->pi_hca_guid != rib_stat->hca->hca_guid) {
1680 
1681 		bzero(s_ip, sizeof (ibt_path_ip_src_t));
1682 		return (RDMA_FAILED);
1683 	}
1684 
1685 	if (srcip.ip_primary.family == AF_INET) {
1686 		s_ip->family = AF_INET;
1687 		s_ip->un.ip4addr = htonl(srcip.ip_primary.un.ip4addr);
1688 	} else {
1689 		s_ip->family = AF_INET6;
1690 		s_ip->un.ip6addr = srcip.ip_primary.un.ip6addr;
1691 	}
1692 
1693 	return (RDMA_SUCCESS);
1694 }
1695 
1696 
1697 /*
1698  * Connect to the server.
1699  */
1700 rdma_stat
1701 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, ibt_path_info_t *path,
1702 		ibt_ip_addr_t *s_ip, ibt_ip_addr_t *d_ip)
1703 {
1704 	ibt_chan_open_args_t	chan_args;	/* channel args */
1705 	ibt_chan_sizes_t	chan_sizes;
1706 	ibt_rc_chan_alloc_args_t	qp_attr;
1707 	ibt_status_t		ibt_status;
1708 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1709 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1710 	ibt_ip_cm_info_t	ipcm_info;
1711 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1712 
1713 
1714 	(void) bzero(&chan_args, sizeof (chan_args));
1715 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1716 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1717 
1718 	switch (ipcm_info.src_addr.family = s_ip->family) {
1719 	case AF_INET:
1720 		ipcm_info.src_addr.un.ip4addr = s_ip->un.ip4addr;
1721 		break;
1722 	case AF_INET6:
1723 		ipcm_info.src_addr.un.ip6addr = s_ip->un.ip6addr;
1724 		break;
1725 	}
1726 
1727 	switch (ipcm_info.dst_addr.family = d_ip->family) {
1728 	case AF_INET:
1729 		ipcm_info.dst_addr.un.ip4addr = d_ip->un.ip4addr;
1730 		break;
1731 	case AF_INET6:
1732 		ipcm_info.dst_addr.un.ip6addr = d_ip->un.ip6addr;
1733 		break;
1734 	}
1735 
1736 	ipcm_info.src_port = NFS_RDMA_PORT;
1737 
1738 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1739 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1740 
1741 	if (ibt_status != IBT_SUCCESS) {
1742 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1743 		return (-1);
1744 	}
1745 
1746 	qp_attr.rc_hca_port_num = path->pi_prim_cep_path.cep_hca_port_num;
1747 	/* Alloc a RC channel */
1748 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1749 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1750 	qp_attr.rc_pd = hca->pd_hdl;
1751 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1752 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1753 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1754 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1755 	qp_attr.rc_clone_chan = NULL;
1756 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1757 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1758 
1759 	path->pi_sid = ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT);
1760 	chan_args.oc_path = path;
1761 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1762 	chan_args.oc_cm_clnt_private = (void *)rib_stat;
1763 	chan_args.oc_rdma_ra_out = 4;
1764 	chan_args.oc_rdma_ra_in = 4;
1765 	chan_args.oc_path_retry_cnt = 2;
1766 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1767 	chan_args.oc_priv_data = cmp_ip_pvt;
1768 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1769 
1770 refresh:
1771 	rw_enter(&hca->state_lock, RW_READER);
1772 	if (hca->state != HCA_DETACHED) {
1773 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1774 		    IBT_ACHAN_NO_FLAGS,
1775 		    &qp_attr, &qp->qp_hdl,
1776 		    &chan_sizes);
1777 	} else {
1778 		rw_exit(&hca->state_lock);
1779 		return (RDMA_FAILED);
1780 	}
1781 	rw_exit(&hca->state_lock);
1782 
1783 	if (ibt_status != IBT_SUCCESS) {
1784 		DTRACE_PROBE1(rpcib__i_conntosrv,
1785 		    int, ibt_status);
1786 		return (RDMA_FAILED);
1787 	}
1788 
1789 	/* Connect to the Server */
1790 	(void) bzero(&ret_args, sizeof (ret_args));
1791 	mutex_enter(&qp->cb_lock);
1792 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1793 	    IBT_BLOCKING, &chan_args, &ret_args);
1794 	if (ibt_status != IBT_SUCCESS) {
1795 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1796 		    int, ibt_status, int, ret_args.rc_status);
1797 
1798 		(void) ibt_free_channel(qp->qp_hdl);
1799 		qp->qp_hdl = NULL;
1800 		mutex_exit(&qp->cb_lock);
1801 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1802 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1803 			/*
1804 			 * Got IBT_CM_CONN_STALE probably because of stale
1805 			 * data on the passive end of a channel that existed
1806 			 * prior to reboot. Retry establishing a channel
1807 			 * REFRESH_ATTEMPTS times, during which time the
1808 			 * stale conditions on the server might clear up.
1809 			 */
1810 			goto refresh;
1811 		}
1812 		return (RDMA_FAILED);
1813 	}
1814 	mutex_exit(&qp->cb_lock);
1815 	/*
1816 	 * Set the private data area to qp to be used in callbacks
1817 	 */
1818 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1819 	return (RDMA_SUCCESS);
1820 }
1821 
1822 rdma_stat
1823 rib_ping_srv(int addr_type, struct netbuf *raddr, rib_hca_t **hca)
1824 {
1825 	struct sockaddr_in	*sin4, *sin4arr;
1826 	struct sockaddr_in6	*sin6, *sin6arr;
1827 	uint_t			nif, nif4, nif6, i;
1828 	ibt_path_info_t		path;
1829 	ibt_status_t		ibt_status;
1830 	uint8_t			num_paths_p;
1831 	ibt_ip_path_attr_t	ipattr;
1832 	ibt_ip_addr_t		dstip;
1833 	ibt_path_ip_src_t	srcip;
1834 
1835 
1836 	*hca = NULL;
1837 
1838 	ASSERT(raddr->buf != NULL);
1839 
1840 	bzero(&path, sizeof (ibt_path_info_t));
1841 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1842 	bzero(&srcip, sizeof (ibt_path_ip_src_t));
1843 
1844 	/* Obtain the source IP addresses for the system */
1845 	nif = rpcib_get_number_interfaces();
1846 	sin4arr = (struct sockaddr_in *)
1847 	    kmem_zalloc(sizeof (struct sockaddr_in) * nif, KM_SLEEP);
1848 	sin6arr = (struct sockaddr_in6 *)
1849 	    kmem_zalloc(sizeof (struct sockaddr_in6) * nif, KM_SLEEP);
1850 
1851 	(void) rpcib_get_ib_addresses(sin4arr, sin6arr, &nif4, &nif6);
1852 
1853 	/* Are there really any IB interfaces available */
1854 	if (nif4 == 0 && nif6 == 0) {
1855 		kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
1856 		kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
1857 		return (RDMA_FAILED);
1858 	}
1859 
1860 	/* Prep the destination address */
1861 	switch (addr_type) {
1862 	case AF_INET:
1863 		sin4 = (struct sockaddr_in *)raddr->buf;
1864 		dstip.family = AF_INET;
1865 		dstip.un.ip4addr = htonl(sin4->sin_addr.s_addr);
1866 
1867 		for (i = 0; i < nif4; i++) {
1868 			num_paths_p = 0;
1869 			ipattr.ipa_dst_ip 	= &dstip;
1870 			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1871 			ipattr.ipa_ndst		= 1;
1872 			ipattr.ipa_max_paths	= 1;
1873 			ipattr.ipa_src_ip.family = dstip.family;
1874 			ipattr.ipa_src_ip.un.ip4addr =
1875 			    htonl(sin4arr[i].sin_addr.s_addr);
1876 
1877 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1878 			    IBT_PATH_NO_FLAGS,
1879 			    &ipattr,
1880 			    &path,
1881 			    &num_paths_p,
1882 			    &srcip);
1883 			if (ibt_status == IBT_SUCCESS &&
1884 			    num_paths_p != 0 &&
1885 			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
1886 				*hca = rib_stat->hca;
1887 
1888 				kmem_free(sin4arr,
1889 				    sizeof (struct sockaddr_in) * nif);
1890 				kmem_free(sin6arr,
1891 				    sizeof (struct sockaddr_in6) * nif);
1892 
1893 				return (RDMA_SUCCESS);
1894 			}
1895 		}
1896 		break;
1897 
1898 	case AF_INET6:
1899 		sin6 = (struct sockaddr_in6 *)raddr->buf;
1900 		dstip.family = AF_INET6;
1901 		dstip.un.ip6addr = sin6->sin6_addr;
1902 
1903 		for (i = 0; i < nif6; i++) {
1904 			num_paths_p = 0;
1905 			ipattr.ipa_dst_ip 	= &dstip;
1906 			ipattr.ipa_hca_guid	= rib_stat->hca->hca_guid;
1907 			ipattr.ipa_ndst		= 1;
1908 			ipattr.ipa_max_paths	= 1;
1909 			ipattr.ipa_src_ip.family = dstip.family;
1910 			ipattr.ipa_src_ip.un.ip6addr = sin6arr[i].sin6_addr;
1911 
1912 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1913 			    IBT_PATH_NO_FLAGS,
1914 			    &ipattr,
1915 			    &path,
1916 			    &num_paths_p,
1917 			    &srcip);
1918 			if (ibt_status == IBT_SUCCESS &&
1919 			    num_paths_p != 0 &&
1920 			    path.pi_hca_guid == rib_stat->hca->hca_guid) {
1921 				*hca = rib_stat->hca;
1922 
1923 				kmem_free(sin4arr,
1924 				    sizeof (struct sockaddr_in) * nif);
1925 				kmem_free(sin6arr,
1926 				    sizeof (struct sockaddr_in6) * nif);
1927 
1928 				return (RDMA_SUCCESS);
1929 			}
1930 		}
1931 
1932 		break;
1933 
1934 	default:
1935 		kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
1936 		kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
1937 		return (RDMA_INVAL);
1938 	}
1939 
1940 	kmem_free(sin4arr, sizeof (struct sockaddr_in) * nif);
1941 	kmem_free(sin6arr, sizeof (struct sockaddr_in6) * nif);
1942 	return (RDMA_FAILED);
1943 }
1944 
1945 /*
1946  * Close channel, remove from connection list and
1947  * free up resources allocated for that channel.
1948  */
1949 rdma_stat
1950 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
1951 {
1952 	rib_qp_t	*qp = ctoqp(conn);
1953 	rib_hca_t	*hca;
1954 
1955 	/*
1956 	 * c_ref == 0 and connection is in C_DISCONN_PEND
1957 	 */
1958 	hca = qp->hca;
1959 	if (conn_list != NULL)
1960 		(void) rib_rm_conn(conn, conn_list);
1961 
1962 	if (qp->qp_hdl != NULL) {
1963 		/*
1964 		 * If the channel has not been establised,
1965 		 * ibt_flush_channel is called to flush outstanding WRs
1966 		 * on the Qs.  Otherwise, ibt_close_rc_channel() is
1967 		 * called.  The channel is then freed.
1968 		 */
1969 		if (conn_list != NULL)
1970 			(void) ibt_close_rc_channel(qp->qp_hdl,
1971 			    IBT_BLOCKING, NULL, 0, NULL, NULL, 0);
1972 		else
1973 			(void) ibt_flush_channel(qp->qp_hdl);
1974 
1975 		mutex_enter(&qp->posted_rbufs_lock);
1976 		while (qp->n_posted_rbufs)
1977 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
1978 		mutex_exit(&qp->posted_rbufs_lock);
1979 		(void) ibt_free_channel(qp->qp_hdl);
1980 		qp->qp_hdl = NULL;
1981 	}
1982 
1983 	ASSERT(qp->rdlist == NULL);
1984 
1985 	if (qp->replylist != NULL) {
1986 		(void) rib_rem_replylist(qp);
1987 	}
1988 
1989 	cv_destroy(&qp->cb_conn_cv);
1990 	cv_destroy(&qp->posted_rbufs_cv);
1991 	mutex_destroy(&qp->cb_lock);
1992 
1993 	mutex_destroy(&qp->replylist_lock);
1994 	mutex_destroy(&qp->posted_rbufs_lock);
1995 	mutex_destroy(&qp->rdlist_lock);
1996 
1997 	cv_destroy(&conn->c_cv);
1998 	mutex_destroy(&conn->c_lock);
1999 
2000 	if (conn->c_raddr.buf != NULL) {
2001 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2002 	}
2003 	if (conn->c_laddr.buf != NULL) {
2004 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2005 	}
2006 
2007 	/*
2008 	 * Credit control cleanup.
2009 	 */
2010 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2011 		rdma_clnt_cred_ctrl_t *cc_info;
2012 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2013 		cv_destroy(&cc_info->clnt_cc_cv);
2014 	}
2015 
2016 	kmem_free(qp, sizeof (rib_qp_t));
2017 
2018 	/*
2019 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2020 	 * then the hca is no longer being used.
2021 	 */
2022 	if (conn_list != NULL) {
2023 		rw_enter(&hca->state_lock, RW_READER);
2024 		if (hca->state == HCA_DETACHED) {
2025 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2026 			if (hca->srv_conn_list.conn_hd == NULL) {
2027 				rw_enter(&hca->cl_conn_list.conn_lock,
2028 				    RW_READER);
2029 
2030 				if (hca->cl_conn_list.conn_hd == NULL) {
2031 					mutex_enter(&hca->inuse_lock);
2032 					hca->inuse = FALSE;
2033 					cv_signal(&hca->cb_cv);
2034 					mutex_exit(&hca->inuse_lock);
2035 				}
2036 				rw_exit(&hca->cl_conn_list.conn_lock);
2037 			}
2038 			rw_exit(&hca->srv_conn_list.conn_lock);
2039 		}
2040 		rw_exit(&hca->state_lock);
2041 	}
2042 
2043 	return (RDMA_SUCCESS);
2044 }
2045 
2046 /*
2047  * Wait for send completion notification. Only on receiving a
2048  * notification be it a successful or error completion, free the
2049  * send_wid.
2050  */
2051 static rdma_stat
2052 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2053 {
2054 	clock_t timout, cv_wait_ret;
2055 	rdma_stat error = RDMA_SUCCESS;
2056 	int	i;
2057 
2058 	/*
2059 	 * Wait for send to complete
2060 	 */
2061 	ASSERT(wd != NULL);
2062 	mutex_enter(&wd->sendwait_lock);
2063 	if (wd->status == (uint_t)SEND_WAIT) {
2064 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2065 		    ddi_get_lbolt();
2066 
2067 		if (qp->mode == RIB_SERVER) {
2068 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2069 			    &wd->sendwait_lock, timout)) > 0 &&
2070 			    wd->status == (uint_t)SEND_WAIT)
2071 				;
2072 			switch (cv_wait_ret) {
2073 			case -1:	/* timeout */
2074 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2075 
2076 				wd->cv_sig = 0;		/* no signal needed */
2077 				error = RDMA_TIMEDOUT;
2078 				break;
2079 			default:	/* got send completion */
2080 				break;
2081 			}
2082 		} else {
2083 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2084 			    &wd->sendwait_lock, timout)) > 0 &&
2085 			    wd->status == (uint_t)SEND_WAIT)
2086 				;
2087 			switch (cv_wait_ret) {
2088 			case -1:	/* timeout */
2089 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2090 
2091 				wd->cv_sig = 0;		/* no signal needed */
2092 				error = RDMA_TIMEDOUT;
2093 				break;
2094 			case 0:		/* interrupted */
2095 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2096 
2097 				wd->cv_sig = 0;		/* no signal needed */
2098 				error = RDMA_INTR;
2099 				break;
2100 			default:	/* got send completion */
2101 				break;
2102 			}
2103 		}
2104 	}
2105 
2106 	if (wd->status != (uint_t)SEND_WAIT) {
2107 		/* got send completion */
2108 		if (wd->status != RDMA_SUCCESS) {
2109 			error = wd->status;
2110 		if (wd->status != RDMA_CONNLOST)
2111 			error = RDMA_FAILED;
2112 		}
2113 		for (i = 0; i < wd->nsbufs; i++) {
2114 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2115 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2116 		}
2117 		mutex_exit(&wd->sendwait_lock);
2118 		(void) rib_free_sendwait(wd);
2119 	} else {
2120 		mutex_exit(&wd->sendwait_lock);
2121 	}
2122 	return (error);
2123 }
2124 
2125 static struct send_wid *
2126 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2127 {
2128 	struct send_wid	*wd;
2129 
2130 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2131 	wd->xid = xid;
2132 	wd->cv_sig = cv_sig;
2133 	wd->qp = qp;
2134 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2135 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2136 	wd->status = (uint_t)SEND_WAIT;
2137 
2138 	return (wd);
2139 }
2140 
2141 static int
2142 rib_free_sendwait(struct send_wid *wdesc)
2143 {
2144 	cv_destroy(&wdesc->wait_cv);
2145 	mutex_destroy(&wdesc->sendwait_lock);
2146 	kmem_free(wdesc, sizeof (*wdesc));
2147 
2148 	return (0);
2149 }
2150 
2151 static rdma_stat
2152 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2153 {
2154 	mutex_enter(&qp->replylist_lock);
2155 	if (rep != NULL) {
2156 		(void) rib_remreply(qp, rep);
2157 		mutex_exit(&qp->replylist_lock);
2158 		return (RDMA_SUCCESS);
2159 	}
2160 	mutex_exit(&qp->replylist_lock);
2161 	return (RDMA_FAILED);
2162 }
2163 
2164 /*
2165  * Send buffers are freed here only in case of error in posting
2166  * on QP. If the post succeeded, the send buffers are freed upon
2167  * send completion in rib_sendwait() or in the scq_handler.
2168  */
2169 rdma_stat
2170 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2171 	int send_sig, int cv_sig, caddr_t *swid)
2172 {
2173 	struct send_wid	*wdesc;
2174 	struct clist	*clp;
2175 	ibt_status_t	ibt_status = IBT_SUCCESS;
2176 	rdma_stat	ret = RDMA_SUCCESS;
2177 	ibt_send_wr_t	tx_wr;
2178 	int		i, nds;
2179 	ibt_wr_ds_t	sgl[DSEG_MAX];
2180 	uint_t		total_msg_size;
2181 	rib_qp_t	*qp;
2182 
2183 	qp = ctoqp(conn);
2184 
2185 	ASSERT(cl != NULL);
2186 
2187 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2188 
2189 	nds = 0;
2190 	total_msg_size = 0;
2191 	clp = cl;
2192 	while (clp != NULL) {
2193 		if (nds >= DSEG_MAX) {
2194 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2195 			return (RDMA_FAILED);
2196 		}
2197 		sgl[nds].ds_va = clp->w.c_saddr;
2198 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2199 		sgl[nds].ds_len = clp->c_len;
2200 		total_msg_size += clp->c_len;
2201 		clp = clp->c_next;
2202 		nds++;
2203 	}
2204 
2205 	if (send_sig) {
2206 		/* Set SEND_SIGNAL flag. */
2207 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2208 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2209 		*swid = (caddr_t)wdesc;
2210 	} else {
2211 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2212 		wdesc = rib_init_sendwait(msgid, 0, qp);
2213 		*swid = (caddr_t)wdesc;
2214 	}
2215 	wdesc->nsbufs = nds;
2216 	for (i = 0; i < nds; i++) {
2217 		wdesc->sbufaddr[i] = sgl[i].ds_va;
2218 	}
2219 
2220 	tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2221 	tx_wr.wr_opcode = IBT_WRC_SEND;
2222 	tx_wr.wr_trans = IBT_RC_SRV;
2223 	tx_wr.wr_nds = nds;
2224 	tx_wr.wr_sgl = sgl;
2225 
2226 	mutex_enter(&conn->c_lock);
2227 	if (conn->c_state == C_CONNECTED) {
2228 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2229 	}
2230 	if (conn->c_state != C_CONNECTED ||
2231 	    ibt_status != IBT_SUCCESS) {
2232 		if (conn->c_state != C_DISCONN_PEND)
2233 			conn->c_state = C_ERROR_CONN;
2234 		mutex_exit(&conn->c_lock);
2235 		for (i = 0; i < nds; i++) {
2236 			rib_rbuf_free(conn, SEND_BUFFER,
2237 			    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2238 		}
2239 
2240 		(void) rib_free_sendwait(wdesc);
2241 
2242 		return (RDMA_CONNLOST);
2243 	}
2244 	mutex_exit(&conn->c_lock);
2245 
2246 	if (send_sig) {
2247 		if (cv_sig) {
2248 			/*
2249 			 * cv_wait for send to complete.
2250 			 * We can fail due to a timeout or signal or
2251 			 * unsuccessful send.
2252 			 */
2253 			ret = rib_sendwait(qp, wdesc);
2254 
2255 			return (ret);
2256 		}
2257 	}
2258 
2259 	return (RDMA_SUCCESS);
2260 }
2261 
2262 
2263 rdma_stat
2264 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2265 {
2266 	rdma_stat	ret;
2267 	caddr_t		wd;
2268 
2269 	/* send-wait & cv_signal */
2270 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2271 	return (ret);
2272 }
2273 
2274 /*
2275  * Server interface (svc_rdma_ksend).
2276  * Send RPC reply and wait for RDMA_DONE.
2277  */
2278 rdma_stat
2279 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2280 {
2281 	rdma_stat ret = RDMA_SUCCESS;
2282 	struct rdma_done_list *rd;
2283 	clock_t timout, cv_wait_ret;
2284 	caddr_t *wid = NULL;
2285 	rib_qp_t *qp = ctoqp(conn);
2286 
2287 	mutex_enter(&qp->rdlist_lock);
2288 	rd = rdma_done_add(qp, msgid);
2289 
2290 	/* No cv_signal (whether send-wait or no-send-wait) */
2291 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2292 
2293 	if (ret != RDMA_SUCCESS) {
2294 		rdma_done_rm(qp, rd);
2295 	} else {
2296 		/*
2297 		 * Wait for RDMA_DONE from remote end
2298 		 */
2299 		timout =
2300 		    drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2301 		cv_wait_ret = cv_timedwait(&rd->rdma_done_cv,
2302 		    &qp->rdlist_lock,
2303 		    timout);
2304 
2305 		rdma_done_rm(qp, rd);
2306 
2307 		if (cv_wait_ret < 0) {
2308 			ret = RDMA_TIMEDOUT;
2309 		}
2310 	}
2311 
2312 	mutex_exit(&qp->rdlist_lock);
2313 	return (ret);
2314 }
2315 
2316 static struct recv_wid *
2317 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2318 {
2319 	struct recv_wid	*rwid;
2320 
2321 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2322 	rwid->xid = msgid;
2323 	rwid->addr = sgl->ds_va;
2324 	rwid->qp = qp;
2325 
2326 	return (rwid);
2327 }
2328 
2329 static void
2330 rib_free_wid(struct recv_wid *rwid)
2331 {
2332 	kmem_free(rwid, sizeof (struct recv_wid));
2333 }
2334 
2335 rdma_stat
2336 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2337 {
2338 	rib_qp_t	*qp = ctoqp(conn);
2339 	struct clist	*clp = cl;
2340 	struct reply	*rep;
2341 	struct recv_wid	*rwid;
2342 	int		nds;
2343 	ibt_wr_ds_t	sgl[DSEG_MAX];
2344 	ibt_recv_wr_t	recv_wr;
2345 	rdma_stat	ret;
2346 	ibt_status_t	ibt_status;
2347 
2348 	/*
2349 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2350 	 */
2351 
2352 	nds = 0;
2353 	while (cl != NULL) {
2354 		if (nds >= DSEG_MAX) {
2355 			ret = RDMA_FAILED;
2356 			goto done;
2357 		}
2358 		sgl[nds].ds_va = cl->w.c_saddr;
2359 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2360 		sgl[nds].ds_len = cl->c_len;
2361 		cl = cl->c_next;
2362 		nds++;
2363 	}
2364 
2365 	if (nds != 1) {
2366 		ret = RDMA_FAILED;
2367 		goto done;
2368 	}
2369 
2370 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2371 	recv_wr.wr_nds = nds;
2372 	recv_wr.wr_sgl = sgl;
2373 
2374 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2375 	if (rwid) {
2376 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2377 	} else {
2378 		ret = RDMA_NORESOURCE;
2379 		goto done;
2380 	}
2381 	rep = rib_addreplylist(qp, msgid);
2382 	if (!rep) {
2383 		rib_free_wid(rwid);
2384 		ret = RDMA_NORESOURCE;
2385 		goto done;
2386 	}
2387 
2388 	mutex_enter(&conn->c_lock);
2389 
2390 	if (conn->c_state == C_CONNECTED) {
2391 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2392 	}
2393 
2394 	if (conn->c_state != C_CONNECTED ||
2395 	    ibt_status != IBT_SUCCESS) {
2396 		if (conn->c_state != C_DISCONN_PEND)
2397 			conn->c_state = C_ERROR_CONN;
2398 		mutex_exit(&conn->c_lock);
2399 		rib_free_wid(rwid);
2400 		(void) rib_rem_rep(qp, rep);
2401 		ret = RDMA_CONNLOST;
2402 		goto done;
2403 	}
2404 	mutex_exit(&conn->c_lock);
2405 	return (RDMA_SUCCESS);
2406 
2407 done:
2408 	while (clp != NULL) {
2409 		rib_rbuf_free(conn, RECV_BUFFER,
2410 		    (void *)(uintptr_t)clp->w.c_saddr3);
2411 		clp = clp->c_next;
2412 	}
2413 	return (ret);
2414 }
2415 
2416 rdma_stat
2417 rib_svc_post(CONN* conn, struct clist *cl)
2418 {
2419 	rib_qp_t	*qp = ctoqp(conn);
2420 	struct svc_recv	*s_recvp;
2421 	int		nds;
2422 	ibt_wr_ds_t	sgl[DSEG_MAX];
2423 	ibt_recv_wr_t	recv_wr;
2424 	ibt_status_t	ibt_status;
2425 
2426 	nds = 0;
2427 	while (cl != NULL) {
2428 		if (nds >= DSEG_MAX) {
2429 			return (RDMA_FAILED);
2430 		}
2431 		sgl[nds].ds_va = cl->w.c_saddr;
2432 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2433 		sgl[nds].ds_len = cl->c_len;
2434 		cl = cl->c_next;
2435 		nds++;
2436 	}
2437 
2438 	if (nds != 1) {
2439 		rib_rbuf_free(conn, RECV_BUFFER,
2440 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2441 
2442 		return (RDMA_FAILED);
2443 	}
2444 
2445 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2446 	recv_wr.wr_nds = nds;
2447 	recv_wr.wr_sgl = sgl;
2448 
2449 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2450 	/* Use s_recvp's addr as wr id */
2451 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2452 	mutex_enter(&conn->c_lock);
2453 	if (conn->c_state == C_CONNECTED) {
2454 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2455 	}
2456 	if (conn->c_state != C_CONNECTED ||
2457 	    ibt_status != IBT_SUCCESS) {
2458 		if (conn->c_state != C_DISCONN_PEND)
2459 			conn->c_state = C_ERROR_CONN;
2460 		mutex_exit(&conn->c_lock);
2461 		rib_rbuf_free(conn, RECV_BUFFER,
2462 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2463 		(void) rib_free_svc_recv(s_recvp);
2464 
2465 		return (RDMA_CONNLOST);
2466 	}
2467 	mutex_exit(&conn->c_lock);
2468 
2469 	return (RDMA_SUCCESS);
2470 }
2471 
2472 /* Client */
2473 rdma_stat
2474 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2475 {
2476 
2477 	return (rib_clnt_post(conn, cl, msgid));
2478 }
2479 
2480 /* Client */
2481 rdma_stat
2482 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2483 {
2484 	rib_qp_t	*qp = ctoqp(conn);
2485 	struct reply	*rep;
2486 
2487 	mutex_enter(&qp->replylist_lock);
2488 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2489 		if (rep->xid == msgid) {
2490 			if (rep->vaddr_cq) {
2491 				rib_rbuf_free(conn, RECV_BUFFER,
2492 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2493 			}
2494 			(void) rib_remreply(qp, rep);
2495 			break;
2496 		}
2497 	}
2498 	mutex_exit(&qp->replylist_lock);
2499 
2500 	return (RDMA_SUCCESS);
2501 }
2502 
2503 /* Server */
2504 rdma_stat
2505 rib_post_recv(CONN *conn, struct clist *cl)
2506 {
2507 	rib_qp_t	*qp = ctoqp(conn);
2508 
2509 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2510 		mutex_enter(&qp->posted_rbufs_lock);
2511 		qp->n_posted_rbufs++;
2512 		mutex_exit(&qp->posted_rbufs_lock);
2513 		return (RDMA_SUCCESS);
2514 	}
2515 	return (RDMA_FAILED);
2516 }
2517 
2518 /*
2519  * Client side only interface to "recv" the rpc reply buf
2520  * posted earlier by rib_post_resp(conn, cl, msgid).
2521  */
2522 rdma_stat
2523 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2524 {
2525 	struct reply *rep = NULL;
2526 	clock_t timout, cv_wait_ret;
2527 	rdma_stat ret = RDMA_SUCCESS;
2528 	rib_qp_t *qp = ctoqp(conn);
2529 
2530 	/*
2531 	 * Find the reply structure for this msgid
2532 	 */
2533 	mutex_enter(&qp->replylist_lock);
2534 
2535 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2536 		if (rep->xid == msgid)
2537 			break;
2538 	}
2539 
2540 	if (rep != NULL) {
2541 		/*
2542 		 * If message not yet received, wait.
2543 		 */
2544 		if (rep->status == (uint_t)REPLY_WAIT) {
2545 			timout = ddi_get_lbolt() +
2546 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2547 
2548 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2549 			    &qp->replylist_lock, timout)) > 0 &&
2550 			    rep->status == (uint_t)REPLY_WAIT)
2551 				;
2552 
2553 			switch (cv_wait_ret) {
2554 			case -1:	/* timeout */
2555 				ret = RDMA_TIMEDOUT;
2556 				break;
2557 			case 0:
2558 				ret = RDMA_INTR;
2559 				break;
2560 			default:
2561 				break;
2562 			}
2563 		}
2564 
2565 		if (rep->status == RDMA_SUCCESS) {
2566 			struct clist *cl = NULL;
2567 
2568 			/*
2569 			 * Got message successfully
2570 			 */
2571 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2572 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2573 			*clp = cl;
2574 		} else {
2575 			if (rep->status != (uint_t)REPLY_WAIT) {
2576 				/*
2577 				 * Got error in reply message. Free
2578 				 * recv buffer here.
2579 				 */
2580 				ret = rep->status;
2581 				rib_rbuf_free(conn, RECV_BUFFER,
2582 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2583 			}
2584 		}
2585 		(void) rib_remreply(qp, rep);
2586 	} else {
2587 		/*
2588 		 * No matching reply structure found for given msgid on the
2589 		 * reply wait list.
2590 		 */
2591 		ret = RDMA_INVAL;
2592 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2593 	}
2594 
2595 	/*
2596 	 * Done.
2597 	 */
2598 	mutex_exit(&qp->replylist_lock);
2599 	return (ret);
2600 }
2601 
2602 /*
2603  * RDMA write a buffer to the remote address.
2604  */
2605 rdma_stat
2606 rib_write(CONN *conn, struct clist *cl, int wait)
2607 {
2608 	ibt_send_wr_t	tx_wr;
2609 	int		cv_sig;
2610 	int		i;
2611 	ibt_wr_ds_t	sgl[DSEG_MAX];
2612 	struct send_wid	*wdesc;
2613 	ibt_status_t	ibt_status;
2614 	rdma_stat	ret = RDMA_SUCCESS;
2615 	rib_qp_t	*qp = ctoqp(conn);
2616 	uint64_t	n_writes = 0;
2617 	bool_t		force_wait = FALSE;
2618 
2619 	if (cl == NULL) {
2620 		return (RDMA_FAILED);
2621 	}
2622 
2623 
2624 	while ((cl != NULL)) {
2625 		if (cl->c_len > 0) {
2626 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2627 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2628 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2629 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2630 			sgl[0].ds_va = cl->w.c_saddr;
2631 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2632 			sgl[0].ds_len = cl->c_len;
2633 
2634 			if (wait) {
2635 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2636 				cv_sig = 1;
2637 			} else {
2638 				if (n_writes > max_unsignaled_rws) {
2639 					n_writes = 0;
2640 					force_wait = TRUE;
2641 					tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2642 					cv_sig = 1;
2643 				} else {
2644 					tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2645 					cv_sig = 0;
2646 				}
2647 			}
2648 
2649 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2650 			tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2651 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2652 			tx_wr.wr_trans = IBT_RC_SRV;
2653 			tx_wr.wr_nds = 1;
2654 			tx_wr.wr_sgl = sgl;
2655 
2656 			mutex_enter(&conn->c_lock);
2657 			if (conn->c_state == C_CONNECTED) {
2658 				ibt_status =
2659 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2660 			}
2661 			if (conn->c_state != C_CONNECTED ||
2662 			    ibt_status != IBT_SUCCESS) {
2663 				if (conn->c_state != C_DISCONN_PEND)
2664 					conn->c_state = C_ERROR_CONN;
2665 				mutex_exit(&conn->c_lock);
2666 				(void) rib_free_sendwait(wdesc);
2667 				return (RDMA_CONNLOST);
2668 			}
2669 			mutex_exit(&conn->c_lock);
2670 
2671 			/*
2672 			 * Wait for send to complete
2673 			 */
2674 			if (wait || force_wait) {
2675 				force_wait = FALSE;
2676 				ret = rib_sendwait(qp, wdesc);
2677 				if (ret != 0) {
2678 					return (ret);
2679 				}
2680 			} else {
2681 				mutex_enter(&wdesc->sendwait_lock);
2682 				for (i = 0; i < wdesc->nsbufs; i++) {
2683 					rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2684 					    (void *)(uintptr_t)
2685 					    wdesc->sbufaddr[i]);
2686 				}
2687 				mutex_exit(&wdesc->sendwait_lock);
2688 				(void) rib_free_sendwait(wdesc);
2689 			}
2690 			n_writes ++;
2691 		}
2692 		cl = cl->c_next;
2693 	}
2694 	return (RDMA_SUCCESS);
2695 }
2696 
2697 /*
2698  * RDMA Read a buffer from the remote address.
2699  */
2700 rdma_stat
2701 rib_read(CONN *conn, struct clist *cl, int wait)
2702 {
2703 	ibt_send_wr_t	rx_wr;
2704 	int		cv_sig;
2705 	int		i;
2706 	ibt_wr_ds_t	sgl;
2707 	struct send_wid	*wdesc;
2708 	ibt_status_t	ibt_status = IBT_SUCCESS;
2709 	rdma_stat	ret = RDMA_SUCCESS;
2710 	rib_qp_t	*qp = ctoqp(conn);
2711 
2712 	if (cl == NULL) {
2713 		return (RDMA_FAILED);
2714 	}
2715 
2716 	while (cl != NULL) {
2717 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2718 		/*
2719 		 * Remote address is at the head chunk item in list.
2720 		 */
2721 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2722 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2723 
2724 		sgl.ds_va = cl->u.c_daddr;
2725 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2726 		sgl.ds_len = cl->c_len;
2727 
2728 		if (wait) {
2729 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2730 			cv_sig = 1;
2731 		} else {
2732 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2733 			cv_sig = 0;
2734 		}
2735 
2736 		wdesc = rib_init_sendwait(0, cv_sig, qp);
2737 		rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2738 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2739 		rx_wr.wr_trans = IBT_RC_SRV;
2740 		rx_wr.wr_nds = 1;
2741 		rx_wr.wr_sgl = &sgl;
2742 
2743 		mutex_enter(&conn->c_lock);
2744 		if (conn->c_state == C_CONNECTED) {
2745 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2746 		}
2747 		if (conn->c_state != C_CONNECTED ||
2748 		    ibt_status != IBT_SUCCESS) {
2749 			if (conn->c_state != C_DISCONN_PEND)
2750 				conn->c_state = C_ERROR_CONN;
2751 			mutex_exit(&conn->c_lock);
2752 			(void) rib_free_sendwait(wdesc);
2753 			return (RDMA_CONNLOST);
2754 		}
2755 		mutex_exit(&conn->c_lock);
2756 
2757 		/*
2758 		 * Wait for send to complete if this is the
2759 		 * last item in the list.
2760 		 */
2761 		if (wait && cl->c_next == NULL) {
2762 			ret = rib_sendwait(qp, wdesc);
2763 			if (ret != 0) {
2764 				return (ret);
2765 			}
2766 		} else {
2767 			mutex_enter(&wdesc->sendwait_lock);
2768 			for (i = 0; i < wdesc->nsbufs; i++) {
2769 				rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2770 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2771 			}
2772 			mutex_exit(&wdesc->sendwait_lock);
2773 			(void) rib_free_sendwait(wdesc);
2774 		}
2775 		cl = cl->c_next;
2776 	}
2777 	return (RDMA_SUCCESS);
2778 }
2779 
2780 /*
2781  * rib_srv_cm_handler()
2782  *    Connection Manager callback to handle RC connection requests.
2783  */
2784 /* ARGSUSED */
2785 static ibt_cm_status_t
2786 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2787 	ibt_cm_return_args_t *ret_args, void *priv_data,
2788 	ibt_priv_data_len_t len)
2789 {
2790 	queue_t		*q;
2791 	rib_qp_t	*qp;
2792 	rpcib_state_t	*ribstat;
2793 	rib_hca_t	*hca;
2794 	rdma_stat	status = RDMA_SUCCESS;
2795 	int		i;
2796 	struct clist	cl;
2797 	rdma_buf_t	rdbuf = {0};
2798 	void		*buf = NULL;
2799 	CONN		*conn;
2800 	ibt_ip_cm_info_t	ipinfo;
2801 	struct sockaddr_in *s;
2802 	struct sockaddr_in6 *s6;
2803 	int sin_size = sizeof (struct sockaddr_in);
2804 	int in_size = sizeof (struct in_addr);
2805 	int sin6_size = sizeof (struct sockaddr_in6);
2806 
2807 	ASSERT(any != NULL);
2808 	ASSERT(event != NULL);
2809 
2810 	ribstat = (rpcib_state_t *)any;
2811 	hca = (rib_hca_t *)ribstat->hca;
2812 	ASSERT(hca != NULL);
2813 
2814 	/* got a connection request */
2815 	switch (event->cm_type) {
2816 	case IBT_CM_EVENT_REQ_RCV:
2817 		/*
2818 		 * If the plugin is in the NO_ACCEPT state, bail out.
2819 		 */
2820 		mutex_enter(&plugin_state_lock);
2821 		if (plugin_state == NO_ACCEPT) {
2822 			mutex_exit(&plugin_state_lock);
2823 			return (IBT_CM_REJECT);
2824 		}
2825 		mutex_exit(&plugin_state_lock);
2826 
2827 		/*
2828 		 * Need to send a MRA MAD to CM so that it does not
2829 		 * timeout on us.
2830 		 */
2831 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2832 		    event->cm_event.req.req_timeout * 8, NULL, 0);
2833 
2834 		mutex_enter(&rib_stat->open_hca_lock);
2835 		q = rib_stat->q;
2836 		mutex_exit(&rib_stat->open_hca_lock);
2837 
2838 		status = rib_svc_create_chan(hca, (caddr_t)q,
2839 		    event->cm_event.req.req_prim_hca_port, &qp);
2840 
2841 		if (status) {
2842 			return (IBT_CM_REJECT);
2843 		}
2844 
2845 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2846 		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2847 		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2848 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2849 
2850 		/*
2851 		 * Pre-posts RECV buffers
2852 		 */
2853 		conn = qptoc(qp);
2854 		for (i = 0; i < preposted_rbufs; i++) {
2855 			bzero(&rdbuf, sizeof (rdbuf));
2856 			rdbuf.type = RECV_BUFFER;
2857 			buf = rib_rbuf_alloc(conn, &rdbuf);
2858 			if (buf == NULL) {
2859 				(void) rib_disconnect_channel(conn, NULL);
2860 				return (IBT_CM_REJECT);
2861 			}
2862 
2863 			bzero(&cl, sizeof (cl));
2864 			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
2865 			cl.c_len = rdbuf.len;
2866 			cl.c_smemhandle.mrc_lmr =
2867 			    rdbuf.handle.mrc_lmr; /* lkey */
2868 			cl.c_next = NULL;
2869 			status = rib_post_recv(conn, &cl);
2870 			if (status != RDMA_SUCCESS) {
2871 				(void) rib_disconnect_channel(conn, NULL);
2872 				return (IBT_CM_REJECT);
2873 			}
2874 		}
2875 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
2876 
2877 		/*
2878 		 * Get the address translation
2879 		 */
2880 		rw_enter(&hca->state_lock, RW_READER);
2881 		if (hca->state == HCA_DETACHED) {
2882 			rw_exit(&hca->state_lock);
2883 			return (IBT_CM_REJECT);
2884 		}
2885 		rw_exit(&hca->state_lock);
2886 
2887 		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
2888 
2889 		if (ibt_get_ip_data(event->cm_priv_data_len,
2890 		    event->cm_priv_data,
2891 		    &ipinfo) != IBT_SUCCESS) {
2892 
2893 			return (IBT_CM_REJECT);
2894 		}
2895 
2896 		switch (ipinfo.src_addr.family) {
2897 		case AF_INET:
2898 
2899 			conn->c_raddr.maxlen =
2900 			    conn->c_raddr.len = sin_size;
2901 			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
2902 
2903 			s = (struct sockaddr_in *)conn->c_raddr.buf;
2904 			s->sin_family = AF_INET;
2905 
2906 			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
2907 			    &s->sin_addr, in_size);
2908 
2909 			break;
2910 
2911 		case AF_INET6:
2912 
2913 			conn->c_raddr.maxlen =
2914 			    conn->c_raddr.len = sin6_size;
2915 			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
2916 
2917 			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
2918 			s6->sin6_family = AF_INET6;
2919 			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
2920 			    &s6->sin6_addr,
2921 			    sizeof (struct in6_addr));
2922 
2923 			break;
2924 
2925 		default:
2926 			return (IBT_CM_REJECT);
2927 		}
2928 
2929 		break;
2930 
2931 	case IBT_CM_EVENT_CONN_CLOSED:
2932 	{
2933 		CONN		*conn;
2934 		rib_qp_t	*qp;
2935 
2936 		switch (event->cm_event.closed) {
2937 		case IBT_CM_CLOSED_DREP_RCVD:
2938 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
2939 		case IBT_CM_CLOSED_DUP:
2940 		case IBT_CM_CLOSED_ABORT:
2941 		case IBT_CM_CLOSED_ALREADY:
2942 			/*
2943 			 * These cases indicate the local end initiated
2944 			 * the closing of the channel. Nothing to do here.
2945 			 */
2946 			break;
2947 		default:
2948 			/*
2949 			 * Reason for CONN_CLOSED event must be one of
2950 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
2951 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
2952 			 * the remote end is closing the channel. In these
2953 			 * cases free the channel and transition to error
2954 			 * state
2955 			 */
2956 			qp = ibt_get_chan_private(event->cm_channel);
2957 			conn = qptoc(qp);
2958 			mutex_enter(&conn->c_lock);
2959 			if (conn->c_state == C_DISCONN_PEND) {
2960 				mutex_exit(&conn->c_lock);
2961 				break;
2962 			}
2963 			conn->c_state = C_ERROR_CONN;
2964 
2965 			/*
2966 			 * Free the rc_channel. Channel has already
2967 			 * transitioned to ERROR state and WRs have been
2968 			 * FLUSHED_ERR already.
2969 			 */
2970 			(void) ibt_free_channel(qp->qp_hdl);
2971 			qp->qp_hdl = NULL;
2972 
2973 			/*
2974 			 * Free the conn if c_ref goes down to 0
2975 			 */
2976 			if (conn->c_ref == 0) {
2977 				/*
2978 				 * Remove from list and free conn
2979 				 */
2980 				conn->c_state = C_DISCONN_PEND;
2981 				mutex_exit(&conn->c_lock);
2982 				(void) rib_disconnect_channel(conn,
2983 				    &hca->srv_conn_list);
2984 			} else {
2985 				mutex_exit(&conn->c_lock);
2986 			}
2987 			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
2988 			break;
2989 		}
2990 		break;
2991 	}
2992 	case IBT_CM_EVENT_CONN_EST:
2993 		/*
2994 		 * RTU received, hence connection established.
2995 		 */
2996 		if (rib_debug > 1)
2997 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
2998 			    "(CONN_EST) channel established");
2999 		break;
3000 
3001 	default:
3002 		if (rib_debug > 2) {
3003 			/* Let CM handle the following events. */
3004 			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3005 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3006 				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3007 			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3008 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3009 				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3010 			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3011 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3012 				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3013 			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3014 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3015 				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3016 			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3017 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3018 				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
3019 			}
3020 		}
3021 		return (IBT_CM_DEFAULT);
3022 	}
3023 
3024 	/* accept all other CM messages (i.e. let the CM handle them) */
3025 	return (IBT_CM_ACCEPT);
3026 }
3027 
3028 static rdma_stat
3029 rib_register_service(rib_hca_t *hca, int service_type)
3030 {
3031 	ibt_srv_desc_t		sdesc;
3032 	ibt_hca_portinfo_t	*port_infop;
3033 	ib_svc_id_t		srv_id;
3034 	ibt_srv_hdl_t		srv_hdl;
3035 	uint_t			port_size;
3036 	uint_t			pki, i, num_ports, nbinds;
3037 	ibt_status_t		ibt_status;
3038 	rib_service_t		*new_service;
3039 	ib_pkey_t		pkey;
3040 
3041 	/*
3042 	 * Query all ports for the given HCA
3043 	 */
3044 	rw_enter(&hca->state_lock, RW_READER);
3045 	if (hca->state != HCA_DETACHED) {
3046 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3047 		    &num_ports, &port_size);
3048 		rw_exit(&hca->state_lock);
3049 	} else {
3050 		rw_exit(&hca->state_lock);
3051 		return (RDMA_FAILED);
3052 	}
3053 	if (ibt_status != IBT_SUCCESS) {
3054 		return (RDMA_FAILED);
3055 	}
3056 
3057 	DTRACE_PROBE1(rpcib__i__regservice_numports,
3058 	    int, num_ports);
3059 
3060 	for (i = 0; i < num_ports; i++) {
3061 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3062 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3063 			    int, i+1);
3064 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3065 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
3066 			    int, i+1);
3067 		}
3068 	}
3069 
3070 	/*
3071 	 * Get all the IP addresses on this system to register the
3072 	 * given "service type" on all DNS recognized IP addrs.
3073 	 * Each service type such as NFS will have all the systems
3074 	 * IP addresses as its different names. For now the only
3075 	 * type of service we support in RPCIB is NFS.
3076 	 */
3077 	rw_enter(&hca->service_list_lock, RW_WRITER);
3078 	/*
3079 	 * Start registering and binding service to active
3080 	 * on active ports on this HCA.
3081 	 */
3082 	nbinds = 0;
3083 	new_service = NULL;
3084 
3085 	/*
3086 	 * We use IP addresses as the service names for
3087 	 * service registration.  Register each of them
3088 	 * with CM to obtain a svc_id and svc_hdl.  We do not
3089 	 * register the service with machine's loopback address.
3090 	 */
3091 	(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3092 	(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3093 	(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3094 
3095 	sdesc.sd_handler = rib_srv_cm_handler;
3096 	sdesc.sd_flags = 0;
3097 	ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3098 	    &sdesc, ibt_get_ip_sid(IPPROTO_TCP, NFS_RDMA_PORT),
3099 	    1, &srv_hdl, &srv_id);
3100 
3101 	for (i = 0; i < num_ports; i++) {
3102 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3103 			continue;
3104 
3105 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3106 			pkey = port_infop[i].p_pkey_tbl[pki];
3107 			if ((pkey & IBSRM_HB) &&
3108 			    (pkey != IB_PKEY_INVALID_FULL)) {
3109 
3110 				/*
3111 				 * Allocate and prepare a service entry
3112 				 */
3113 				new_service =
3114 				    kmem_zalloc(1 * sizeof (rib_service_t),
3115 				    KM_SLEEP);
3116 
3117 				new_service->srv_type = service_type;
3118 				new_service->srv_hdl = srv_hdl;
3119 				new_service->srv_next = NULL;
3120 
3121 				ibt_status = ibt_bind_service(srv_hdl,
3122 				    port_infop[i].p_sgid_tbl[0],
3123 				    NULL, rib_stat, NULL);
3124 
3125 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3126 				    int, ibt_status);
3127 
3128 				if (ibt_status != IBT_SUCCESS) {
3129 					kmem_free(new_service,
3130 					    sizeof (rib_service_t));
3131 					new_service = NULL;
3132 					continue;
3133 				}
3134 
3135 				/*
3136 				 * Add to the service list for this HCA
3137 				 */
3138 				new_service->srv_next = hca->service_list;
3139 				hca->service_list = new_service;
3140 				new_service = NULL;
3141 				nbinds++;
3142 			}
3143 		}
3144 	}
3145 	rw_exit(&hca->service_list_lock);
3146 
3147 	ibt_free_portinfo(port_infop, port_size);
3148 
3149 	if (nbinds == 0) {
3150 		return (RDMA_FAILED);
3151 	} else {
3152 		/*
3153 		 * Put this plugin into accept state, since atleast
3154 		 * one registration was successful.
3155 		 */
3156 		mutex_enter(&plugin_state_lock);
3157 		plugin_state = ACCEPT;
3158 		mutex_exit(&plugin_state_lock);
3159 		return (RDMA_SUCCESS);
3160 	}
3161 }
3162 
3163 void
3164 rib_listen(struct rdma_svc_data *rd)
3165 {
3166 	rdma_stat status = RDMA_SUCCESS;
3167 
3168 	rd->active = 0;
3169 	rd->err_code = RDMA_FAILED;
3170 
3171 	/*
3172 	 * First check if a hca is still attached
3173 	 */
3174 	rw_enter(&rib_stat->hca->state_lock, RW_READER);
3175 	if (rib_stat->hca->state != HCA_INITED) {
3176 		rw_exit(&rib_stat->hca->state_lock);
3177 		return;
3178 	}
3179 	rw_exit(&rib_stat->hca->state_lock);
3180 
3181 	rib_stat->q = &rd->q;
3182 	/*
3183 	 * Right now the only service type is NFS. Hence force feed this
3184 	 * value. Ideally to communicate the service type it should be
3185 	 * passed down in rdma_svc_data.
3186 	 */
3187 	rib_stat->service_type = NFS;
3188 	status = rib_register_service(rib_stat->hca, NFS);
3189 	if (status != RDMA_SUCCESS) {
3190 		rd->err_code = status;
3191 		return;
3192 	}
3193 	/*
3194 	 * Service active on an HCA, check rd->err_code for more
3195 	 * explainable errors.
3196 	 */
3197 	rd->active = 1;
3198 	rd->err_code = status;
3199 }
3200 
3201 /* XXXX */
3202 /* ARGSUSED */
3203 static void
3204 rib_listen_stop(struct rdma_svc_data *svcdata)
3205 {
3206 	rib_hca_t		*hca;
3207 
3208 	/*
3209 	 * KRPC called the RDMATF to stop the listeners, this means
3210 	 * stop sending incomming or recieved requests to KRPC master
3211 	 * transport handle for RDMA-IB. This is also means that the
3212 	 * master transport handle, responsible for us, is going away.
3213 	 */
3214 	mutex_enter(&plugin_state_lock);
3215 	plugin_state = NO_ACCEPT;
3216 	if (svcdata != NULL)
3217 		svcdata->active = 0;
3218 	mutex_exit(&plugin_state_lock);
3219 
3220 	/*
3221 	 * First check if a hca is still attached
3222 	 */
3223 	hca = rib_stat->hca;
3224 	rw_enter(&hca->state_lock, RW_READER);
3225 	if (hca->state != HCA_INITED) {
3226 		rw_exit(&hca->state_lock);
3227 		return;
3228 	}
3229 	rib_close_channels(&hca->srv_conn_list);
3230 	rib_stop_services(hca);
3231 	rw_exit(&hca->state_lock);
3232 }
3233 
3234 /*
3235  * Traverse the HCA's service list to unbind and deregister services.
3236  * Instead of unbinding the service for a service handle by
3237  * calling ibt_unbind_service() for each port/pkey, we unbind
3238  * all the services for the service handle by making only one
3239  * call to ibt_unbind_all_services().  Then, we deregister the
3240  * service for the service handle.
3241  *
3242  * When traversing the entries in service_list, we compare the
3243  * srv_hdl of the current entry with that of the next.  If they
3244  * are different or if the next entry is NULL, the current entry
3245  * marks the last binding of the service handle.  In this case,
3246  * call ibt_unbind_all_services() and deregister the service for
3247  * the service handle.  If they are the same, the current and the
3248  * next entries are bound to the same service handle.  In this
3249  * case, move on to the next entry.
3250  */
3251 static void
3252 rib_stop_services(rib_hca_t *hca)
3253 {
3254 	rib_service_t		*srv_list, *to_remove;
3255 
3256 	/*
3257 	 * unbind and deregister the services for this service type.
3258 	 * Right now there is only one service type. In future it will
3259 	 * be passed down to this function.
3260 	 */
3261 	rw_enter(&hca->service_list_lock, RW_WRITER);
3262 	srv_list = hca->service_list;
3263 	while (srv_list != NULL) {
3264 		to_remove = srv_list;
3265 		srv_list = to_remove->srv_next;
3266 		if (srv_list == NULL || bcmp(to_remove->srv_hdl,
3267 		    srv_list->srv_hdl, sizeof (ibt_srv_hdl_t))) {
3268 
3269 			(void) ibt_unbind_all_services(to_remove->srv_hdl);
3270 			(void) ibt_deregister_service(hca->ibt_clnt_hdl,
3271 			    to_remove->srv_hdl);
3272 		}
3273 
3274 		kmem_free(to_remove, sizeof (rib_service_t));
3275 	}
3276 	hca->service_list = NULL;
3277 	rw_exit(&hca->service_list_lock);
3278 }
3279 
3280 static struct svc_recv *
3281 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3282 {
3283 	struct svc_recv	*recvp;
3284 
3285 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3286 	recvp->vaddr = sgl->ds_va;
3287 	recvp->qp = qp;
3288 	recvp->bytes_xfer = 0;
3289 	return (recvp);
3290 }
3291 
3292 static int
3293 rib_free_svc_recv(struct svc_recv *recvp)
3294 {
3295 	kmem_free(recvp, sizeof (*recvp));
3296 
3297 	return (0);
3298 }
3299 
3300 static struct reply *
3301 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3302 {
3303 	struct reply	*rep;
3304 
3305 
3306 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3307 	if (rep == NULL) {
3308 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3309 		return (NULL);
3310 	}
3311 	rep->xid = msgid;
3312 	rep->vaddr_cq = NULL;
3313 	rep->bytes_xfer = 0;
3314 	rep->status = (uint_t)REPLY_WAIT;
3315 	rep->prev = NULL;
3316 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3317 
3318 	mutex_enter(&qp->replylist_lock);
3319 	if (qp->replylist) {
3320 		rep->next = qp->replylist;
3321 		qp->replylist->prev = rep;
3322 	}
3323 	qp->rep_list_size++;
3324 
3325 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3326 	    int, qp->rep_list_size);
3327 
3328 	qp->replylist = rep;
3329 	mutex_exit(&qp->replylist_lock);
3330 
3331 	return (rep);
3332 }
3333 
3334 static rdma_stat
3335 rib_rem_replylist(rib_qp_t *qp)
3336 {
3337 	struct reply	*r, *n;
3338 
3339 	mutex_enter(&qp->replylist_lock);
3340 	for (r = qp->replylist; r != NULL; r = n) {
3341 		n = r->next;
3342 		(void) rib_remreply(qp, r);
3343 	}
3344 	mutex_exit(&qp->replylist_lock);
3345 
3346 	return (RDMA_SUCCESS);
3347 }
3348 
3349 static int
3350 rib_remreply(rib_qp_t *qp, struct reply *rep)
3351 {
3352 
3353 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3354 	if (rep->prev) {
3355 		rep->prev->next = rep->next;
3356 	}
3357 	if (rep->next) {
3358 		rep->next->prev = rep->prev;
3359 	}
3360 	if (qp->replylist == rep)
3361 		qp->replylist = rep->next;
3362 
3363 	cv_destroy(&rep->wait_cv);
3364 	qp->rep_list_size--;
3365 
3366 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3367 	    int, qp->rep_list_size);
3368 
3369 	kmem_free(rep, sizeof (*rep));
3370 
3371 	return (0);
3372 }
3373 
3374 rdma_stat
3375 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3376 	struct mrc *buf_handle)
3377 {
3378 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3379 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3380 	rdma_stat	status;
3381 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3382 
3383 	/*
3384 	 * Note: ALL buffer pools use the same memory type RDMARW.
3385 	 */
3386 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3387 	if (status == RDMA_SUCCESS) {
3388 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3389 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3390 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3391 	} else {
3392 		buf_handle->mrc_linfo = NULL;
3393 		buf_handle->mrc_lmr = 0;
3394 		buf_handle->mrc_rmr = 0;
3395 	}
3396 	return (status);
3397 }
3398 
3399 static rdma_stat
3400 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3401 	ibt_mr_flags_t spec,
3402 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3403 {
3404 	ibt_mr_attr_t	mem_attr;
3405 	ibt_status_t	ibt_status;
3406 	mem_attr.mr_vaddr = (uintptr_t)buf;
3407 	mem_attr.mr_len = (ib_msglen_t)size;
3408 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3409 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3410 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3411 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3412 
3413 	rw_enter(&hca->state_lock, RW_READER);
3414 	if (hca->state == HCA_INITED) {
3415 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3416 		    &mem_attr, mr_hdlp, mr_descp);
3417 		rw_exit(&hca->state_lock);
3418 	} else {
3419 		rw_exit(&hca->state_lock);
3420 		return (RDMA_FAILED);
3421 	}
3422 
3423 	if (ibt_status != IBT_SUCCESS) {
3424 		return (RDMA_FAILED);
3425 	}
3426 	return (RDMA_SUCCESS);
3427 }
3428 
3429 rdma_stat
3430 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3431 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3432 {
3433 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3434 	rib_lrc_entry_t *l;
3435 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3436 	rdma_stat	status;
3437 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3438 
3439 	/*
3440 	 * Non-coherent memory registration.
3441 	 */
3442 	l = (rib_lrc_entry_t *)lrc;
3443 	if (l) {
3444 		if (l->registered) {
3445 			buf_handle->mrc_linfo =
3446 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3447 			buf_handle->mrc_lmr =
3448 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3449 			buf_handle->mrc_rmr =
3450 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3451 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3452 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3453 			return (RDMA_SUCCESS);
3454 		} else {
3455 			/* Always register the whole buffer */
3456 			buf = (caddr_t)l->lrc_buf;
3457 			buflen = l->lrc_len;
3458 		}
3459 	}
3460 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3461 
3462 	if (status == RDMA_SUCCESS) {
3463 		if (l) {
3464 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3465 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3466 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3467 			l->registered		 = TRUE;
3468 		}
3469 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3470 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3471 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3472 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3473 	} else {
3474 		buf_handle->mrc_linfo = NULL;
3475 		buf_handle->mrc_lmr = 0;
3476 		buf_handle->mrc_rmr = 0;
3477 	}
3478 	return (status);
3479 }
3480 
3481 /* ARGSUSED */
3482 rdma_stat
3483 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3484 {
3485 	rib_hca_t *hca = (ctoqp(conn))->hca;
3486 	/*
3487 	 * Allow memory deregistration even if HCA is
3488 	 * getting detached. Need all outstanding
3489 	 * memory registrations to be deregistered
3490 	 * before HCA_DETACH_EVENT can be accepted.
3491 	 */
3492 	(void) ibt_deregister_mr(hca->hca_hdl,
3493 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3494 	return (RDMA_SUCCESS);
3495 }
3496 
3497 /* ARGSUSED */
3498 rdma_stat
3499 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3500 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3501 {
3502 	rib_lrc_entry_t *l;
3503 	l = (rib_lrc_entry_t *)lrc;
3504 	if (l)
3505 		if (l->registered)
3506 			return (RDMA_SUCCESS);
3507 
3508 	(void) rib_deregistermem(conn, buf, buf_handle);
3509 
3510 	return (RDMA_SUCCESS);
3511 }
3512 
3513 /* ARGSUSED */
3514 rdma_stat
3515 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3516 		int len, int cpu)
3517 {
3518 	ibt_status_t	status;
3519 	rib_hca_t *hca = (ctoqp(conn))->hca;
3520 	ibt_mr_sync_t	mr_segment;
3521 
3522 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3523 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3524 	mr_segment.ms_len = (ib_memlen_t)len;
3525 	if (cpu) {
3526 		/* make incoming data visible to memory */
3527 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3528 	} else {
3529 		/* make memory changes visible to IO */
3530 		mr_segment.ms_flags = IBT_SYNC_READ;
3531 	}
3532 	rw_enter(&hca->state_lock, RW_READER);
3533 	if (hca->state == HCA_INITED) {
3534 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3535 		rw_exit(&hca->state_lock);
3536 	} else {
3537 		rw_exit(&hca->state_lock);
3538 		return (RDMA_FAILED);
3539 	}
3540 
3541 	if (status == IBT_SUCCESS)
3542 		return (RDMA_SUCCESS);
3543 	else {
3544 		return (RDMA_FAILED);
3545 	}
3546 }
3547 
3548 /*
3549  * XXXX	????
3550  */
3551 static rdma_stat
3552 rib_getinfo(rdma_info_t *info)
3553 {
3554 	/*
3555 	 * XXXX	Hack!
3556 	 */
3557 	info->addrlen = 16;
3558 	info->mts = 1000000;
3559 	info->mtu = 1000000;
3560 
3561 	return (RDMA_SUCCESS);
3562 }
3563 
3564 rib_bufpool_t *
3565 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3566 {
3567 	rib_bufpool_t	*rbp = NULL;
3568 	bufpool_t	*bp = NULL;
3569 	caddr_t		buf;
3570 	ibt_mr_attr_t	mem_attr;
3571 	ibt_status_t	ibt_status;
3572 	int		i, j;
3573 
3574 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3575 
3576 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3577 	    num * sizeof (void *), KM_SLEEP);
3578 
3579 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3580 	bp->numelems = num;
3581 
3582 
3583 	switch (ptype) {
3584 	case SEND_BUFFER:
3585 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3586 		bp->rsize = RPC_MSG_SZ;
3587 		break;
3588 	case RECV_BUFFER:
3589 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3590 		bp->rsize = RPC_BUF_SIZE;
3591 		break;
3592 	default:
3593 		goto fail;
3594 	}
3595 
3596 	/*
3597 	 * Register the pool.
3598 	 */
3599 	bp->bufsize = num * bp->rsize;
3600 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3601 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3602 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3603 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3604 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3605 	rw_enter(&hca->state_lock, RW_READER);
3606 
3607 	if (hca->state != HCA_INITED) {
3608 		rw_exit(&hca->state_lock);
3609 		goto fail;
3610 	}
3611 
3612 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3613 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3614 		mem_attr.mr_vaddr = (uintptr_t)buf;
3615 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3616 		mem_attr.mr_as = NULL;
3617 		ibt_status = ibt_register_mr(hca->hca_hdl,
3618 		    hca->pd_hdl, &mem_attr,
3619 		    &rbp->mr_hdl[i],
3620 		    &rbp->mr_desc[i]);
3621 		if (ibt_status != IBT_SUCCESS) {
3622 			for (j = 0; j < i; j++) {
3623 				(void) ibt_deregister_mr(hca->hca_hdl,
3624 				    rbp->mr_hdl[j]);
3625 			}
3626 			rw_exit(&hca->state_lock);
3627 			goto fail;
3628 		}
3629 	}
3630 	rw_exit(&hca->state_lock);
3631 	buf = (caddr_t)bp->buf;
3632 	for (i = 0; i < num; i++, buf += bp->rsize) {
3633 		bp->buflist[i] = (void *)buf;
3634 	}
3635 	bp->buffree = num - 1;	/* no. of free buffers */
3636 	rbp->bpool = bp;
3637 
3638 	return (rbp);
3639 fail:
3640 	if (bp) {
3641 		if (bp->buf)
3642 			kmem_free(bp->buf, bp->bufsize);
3643 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3644 	}
3645 	if (rbp) {
3646 		if (rbp->mr_hdl)
3647 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3648 		if (rbp->mr_desc)
3649 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3650 		kmem_free(rbp, sizeof (rib_bufpool_t));
3651 	}
3652 	return (NULL);
3653 }
3654 
3655 static void
3656 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3657 {
3658 	int i;
3659 	rib_bufpool_t *rbp = NULL;
3660 	bufpool_t *bp;
3661 
3662 	/*
3663 	 * Obtain pool address based on type of pool
3664 	 */
3665 	switch (ptype) {
3666 		case SEND_BUFFER:
3667 			rbp = hca->send_pool;
3668 			break;
3669 		case RECV_BUFFER:
3670 			rbp = hca->recv_pool;
3671 			break;
3672 		default:
3673 			return;
3674 	}
3675 	if (rbp == NULL)
3676 		return;
3677 
3678 	bp = rbp->bpool;
3679 
3680 	/*
3681 	 * Deregister the pool memory and free it.
3682 	 */
3683 	for (i = 0; i < bp->numelems; i++) {
3684 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3685 	}
3686 }
3687 
3688 static void
3689 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3690 {
3691 
3692 	rib_bufpool_t *rbp = NULL;
3693 	bufpool_t *bp;
3694 
3695 	/*
3696 	 * Obtain pool address based on type of pool
3697 	 */
3698 	switch (ptype) {
3699 		case SEND_BUFFER:
3700 			rbp = hca->send_pool;
3701 			break;
3702 		case RECV_BUFFER:
3703 			rbp = hca->recv_pool;
3704 			break;
3705 		default:
3706 			return;
3707 	}
3708 	if (rbp == NULL)
3709 		return;
3710 
3711 	bp = rbp->bpool;
3712 
3713 	/*
3714 	 * Free the pool memory.
3715 	 */
3716 	if (rbp->mr_hdl)
3717 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3718 
3719 	if (rbp->mr_desc)
3720 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3721 	if (bp->buf)
3722 		kmem_free(bp->buf, bp->bufsize);
3723 	mutex_destroy(&bp->buflock);
3724 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3725 	kmem_free(rbp, sizeof (rib_bufpool_t));
3726 }
3727 
3728 void
3729 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3730 {
3731 	/*
3732 	 * Deregister the pool memory and free it.
3733 	 */
3734 	rib_rbufpool_deregister(hca, ptype);
3735 	rib_rbufpool_free(hca, ptype);
3736 }
3737 
3738 /*
3739  * Fetch a buffer from the pool of type specified in rdbuf->type.
3740  */
3741 static rdma_stat
3742 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3743 {
3744 	rib_lrc_entry_t *rlep;
3745 
3746 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
3747 		rlep = rib_get_cache_buf(conn, rdbuf->len);
3748 		rdbuf->rb_private =  (caddr_t)rlep;
3749 		rdbuf->addr = rlep->lrc_buf;
3750 		rdbuf->handle = rlep->lrc_mhandle;
3751 		return (RDMA_SUCCESS);
3752 	}
3753 
3754 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
3755 	if (rdbuf->addr) {
3756 		switch (rdbuf->type) {
3757 		case SEND_BUFFER:
3758 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
3759 			break;
3760 		case RECV_BUFFER:
3761 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
3762 			break;
3763 		default:
3764 			rdbuf->len = 0;
3765 		}
3766 		return (RDMA_SUCCESS);
3767 	} else
3768 		return (RDMA_FAILED);
3769 }
3770 
#if defined(MEASURE_POOL_DEPTH)
/*
 * Intentionally empty hooks.  rib_rbuf_alloc() calls them with the
 * number of pool buffers in use; presumably they exist only as
 * attachment points for tracing -- confirm before removing.
 */
/* ARGSUSED */
static void
rib_recv_bufs(uint32_t x)
{
}

/* ARGSUSED */
static void
rib_send_bufs(uint32_t x)
{
}
#endif
3780 
3781 /*
3782  * Fetch a buffer of specified type.
3783  * Note that rdbuf->handle is mw's rkey.
3784  */
3785 static void *
3786 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3787 {
3788 	rib_qp_t	*qp = ctoqp(conn);
3789 	rib_hca_t	*hca = qp->hca;
3790 	rdma_btype	ptype = rdbuf->type;
3791 	void		*buf;
3792 	rib_bufpool_t	*rbp = NULL;
3793 	bufpool_t	*bp;
3794 	int		i;
3795 
3796 	/*
3797 	 * Obtain pool address based on type of pool
3798 	 */
3799 	switch (ptype) {
3800 	case SEND_BUFFER:
3801 		rbp = hca->send_pool;
3802 		break;
3803 	case RECV_BUFFER:
3804 		rbp = hca->recv_pool;
3805 		break;
3806 	default:
3807 		return (NULL);
3808 	}
3809 	if (rbp == NULL)
3810 		return (NULL);
3811 
3812 	bp = rbp->bpool;
3813 
3814 	mutex_enter(&bp->buflock);
3815 	if (bp->buffree < 0) {
3816 		mutex_exit(&bp->buflock);
3817 		return (NULL);
3818 	}
3819 
3820 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
3821 	buf = bp->buflist[bp->buffree];
3822 	rdbuf->addr = buf;
3823 	rdbuf->len = bp->rsize;
3824 	for (i = bp->numelems - 1; i >= 0; i--) {
3825 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
3826 			rdbuf->handle.mrc_rmr =
3827 			    (uint32_t)rbp->mr_desc[i].md_rkey;
3828 			rdbuf->handle.mrc_linfo =
3829 			    (uintptr_t)rbp->mr_hdl[i];
3830 			rdbuf->handle.mrc_lmr =
3831 			    (uint32_t)rbp->mr_desc[i].md_lkey;
3832 #if defined(MEASURE_POOL_DEPTH)
3833 			if (ptype == SEND_BUFFER)
3834 				rib_send_bufs(MAX_BUFS - (bp->buffree+1));
3835 			if (ptype == RECV_BUFFER)
3836 				rib_recv_bufs(MAX_BUFS - (bp->buffree+1));
3837 #endif
3838 			bp->buffree--;
3839 
3840 			mutex_exit(&bp->buflock);
3841 
3842 			return (buf);
3843 		}
3844 	}
3845 
3846 	mutex_exit(&bp->buflock);
3847 
3848 	return (NULL);
3849 }
3850 
3851 static void
3852 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
3853 {
3854 
3855 	if (rdbuf->type == RDMA_LONG_BUFFER) {
3856 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
3857 		rdbuf->rb_private = NULL;
3858 		return;
3859 	}
3860 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
3861 }
3862 
3863 static void
3864 rib_rbuf_free(CONN *conn, int ptype, void *buf)
3865 {
3866 	rib_qp_t *qp = ctoqp(conn);
3867 	rib_hca_t *hca = qp->hca;
3868 	rib_bufpool_t *rbp = NULL;
3869 	bufpool_t *bp;
3870 
3871 	/*
3872 	 * Obtain pool address based on type of pool
3873 	 */
3874 	switch (ptype) {
3875 	case SEND_BUFFER:
3876 		rbp = hca->send_pool;
3877 		break;
3878 	case RECV_BUFFER:
3879 		rbp = hca->recv_pool;
3880 		break;
3881 	default:
3882 		return;
3883 	}
3884 	if (rbp == NULL)
3885 		return;
3886 
3887 	bp = rbp->bpool;
3888 
3889 	mutex_enter(&bp->buflock);
3890 	if (++bp->buffree >= bp->numelems) {
3891 		/*
3892 		 * Should never happen
3893 		 */
3894 		bp->buffree--;
3895 	} else {
3896 		bp->buflist[bp->buffree] = buf;
3897 	}
3898 	mutex_exit(&bp->buflock);
3899 }
3900 
3901 static rdma_stat
3902 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
3903 {
3904 	rw_enter(&connlist->conn_lock, RW_WRITER);
3905 	if (connlist->conn_hd) {
3906 		cn->c_next = connlist->conn_hd;
3907 		connlist->conn_hd->c_prev = cn;
3908 	}
3909 	connlist->conn_hd = cn;
3910 	rw_exit(&connlist->conn_lock);
3911 
3912 	return (RDMA_SUCCESS);
3913 }
3914 
3915 static rdma_stat
3916 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
3917 {
3918 	rw_enter(&connlist->conn_lock, RW_WRITER);
3919 	if (cn->c_prev) {
3920 		cn->c_prev->c_next = cn->c_next;
3921 	}
3922 	if (cn->c_next) {
3923 		cn->c_next->c_prev = cn->c_prev;
3924 	}
3925 	if (connlist->conn_hd == cn)
3926 		connlist->conn_hd = cn->c_next;
3927 	rw_exit(&connlist->conn_lock);
3928 
3929 	return (RDMA_SUCCESS);
3930 }
3931 
/*
 * Connection management.
 * IBTF does not support recycling of channels. So connections are only
 * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
 * C_DISCONN_PEND state. No C_IDLE state.
 * C_CONN_PEND state: Connection establishment in progress to the server.
 * C_CONNECTED state: A connection when created is in C_CONNECTED state.
 * It has an RC channel associated with it. ibt_post_send/recv are allowed
 * only in this state.
 * C_ERROR_CONN state: A connection transitions to this state when WRs on the
 * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
 * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
 * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
 * c_ref drops to 0 (this indicates that RPC has no more references to this
 * connection), the connection should be destroyed. A connection transitions
 * into this state when it is being destroyed.
 *
 * rib_conn_get() returns, in *conn, a referenced client connection to
 * the server address `svcaddr' on the HCA passed as `handle'.  An
 * existing connection on the HCA's client list is shared when one
 * matches; otherwise a new channel is created and connected.
 *
 * Returns RDMA_SUCCESS with *conn set, RDMA_INTR or RDMA_TIMEDOUT when
 * waiting for a pending connection does not end in C_CONNECTED,
 * RDMA_FAILED when the address check or channel creation fails, or the
 * status of rib_conn_to_srv() for a newly created connection.
 */
static rdma_stat
rib_conn_get(struct netbuf *svcaddr, int addr_type, void *handle, CONN **conn)
{
	CONN *cn;
	int status = RDMA_SUCCESS;
	rib_hca_t *hca = (rib_hca_t *)handle;
	rib_qp_t *qp;
	clock_t cv_stat, timout;
	ibt_path_info_t path;
	ibt_ip_addr_t s_ip, d_ip;

again:
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	cn = hca->cl_conn_list.conn_hd;
	while (cn != NULL) {
		/*
		 * First, clear up any connection in the ERROR state
		 */
		mutex_enter(&cn->c_lock);
		if (cn->c_state == C_ERROR_CONN) {
			if (cn->c_ref == 0) {
				/*
				 * Remove connection from list and destroy it.
				 * Both locks are dropped first, so the scan
				 * must restart from the head afterwards.
				 */
				cn->c_state = C_DISCONN_PEND;
				mutex_exit(&cn->c_lock);
				rw_exit(&hca->cl_conn_list.conn_lock);
				(void) rib_disconnect_channel(cn,
				    &hca->cl_conn_list);
				goto again;
			}
			/* Still referenced; leave it for its last user. */
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}
		if (cn->c_state == C_DISCONN_PEND) {
			/* Already being destroyed; skip it. */
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}
		if ((cn->c_raddr.len == svcaddr->len) &&
		    bcmp(svcaddr->buf, cn->c_raddr.buf, svcaddr->len) == 0) {
			/*
			 * Our connection. Give up conn list lock
			 * as we are done traversing the list.
			 */
			rw_exit(&hca->cl_conn_list.conn_lock);
			if (cn->c_state == C_CONNECTED) {
				cn->c_ref++;	/* sharing a conn */
				mutex_exit(&cn->c_lock);
				*conn = cn;
				return (status);
			}
			if (cn->c_state == C_CONN_PEND) {
				/*
				 * Hold a reference to this conn before
				 * we give up the lock.
				 */
				cn->c_ref++;
				timout =  ddi_get_lbolt() +
				    drv_usectohz(CONN_WAIT_TIME * 1000000);
				/*
				 * Wait until the connection leaves
				 * C_CONN_PEND, or the wait returns a
				 * value that is not positive.
				 */
				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
				    &cn->c_lock, timout)) > 0 &&
				    cn->c_state == C_CONN_PEND)
					;
				/* A zero return is reported as RDMA_INTR. */
				if (cv_stat == 0) {
					cn->c_ref--;
					mutex_exit(&cn->c_lock);
					return (RDMA_INTR);
				}
				/* A negative return is reported as a timeout. */
				if (cv_stat < 0) {
					cn->c_ref--;
					mutex_exit(&cn->c_lock);
					return (RDMA_TIMEDOUT);
				}
				if (cn->c_state == C_CONNECTED) {
					/* Keep the reference taken above. */
					*conn = cn;
					mutex_exit(&cn->c_lock);
					return (status);
				} else {
					/*
					 * The connection attempt did not
					 * succeed; drop our reference.
					 */
					cn->c_ref--;
					mutex_exit(&cn->c_lock);
					return (RDMA_TIMEDOUT);
				}
			}
		}
		mutex_exit(&cn->c_lock);
		cn = cn->c_next;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);

	bzero(&path, sizeof (ibt_path_info_t));
	bzero(&s_ip, sizeof (ibt_ip_addr_t));
	bzero(&d_ip, sizeof (ibt_ip_addr_t));

	status = rib_chk_srv_ibaddr(svcaddr, addr_type, &path, &s_ip, &d_ip);
	if (status != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}

	/*
	 * Channel to server doesn't exist yet, create one.
	 */
	if (rib_clnt_create_chan(hca, svcaddr, &qp) != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	cn = qptoc(qp);
	cn->c_state = C_CONN_PEND;
	cn->c_ref = 1;

	/*
	 * Add to conn list.
	 * We had given up the READER lock. In the time since then,
	 * another thread might have created the connection we are
	 * trying here. But for now, that is quite alright - there
	 * might be two connections between a pair of hosts instead
	 * of one. If we really want to close that window,
	 * then need to check the list after acquiring the
	 * WRITER lock.
	 */
	(void) rib_add_connlist(cn, &hca->cl_conn_list);
	status = rib_conn_to_srv(hca, qp, &path, &s_ip, &d_ip);
	mutex_enter(&cn->c_lock);
	if (status == RDMA_SUCCESS) {
		cn->c_state = C_CONNECTED;
		*conn = cn;
	} else {
		/*
		 * The failed connection stays on the list in the ERROR
		 * state with our reference dropped.
		 */
		cn->c_state = C_ERROR_CONN;
		cn->c_ref--;
	}
	/* Wake every thread waiting on c_cv for this connection. */
	cv_broadcast(&cn->c_cv);
	mutex_exit(&cn->c_lock);
	return (status);
}
4083 
4084 static rdma_stat
4085 rib_conn_release(CONN *conn)
4086 {
4087 	rib_qp_t	*qp = ctoqp(conn);
4088 
4089 	mutex_enter(&conn->c_lock);
4090 	conn->c_ref--;
4091 
4092 	/*
4093 	 * If a conn is C_ERROR_CONN, close the channel.
4094 	 * If it's CONNECTED, keep it that way.
4095 	 */
4096 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4097 		conn->c_state = C_DISCONN_PEND;
4098 		mutex_exit(&conn->c_lock);
4099 		if (qp->mode == RIB_SERVER)
4100 			(void) rib_disconnect_channel(conn,
4101 			    &qp->hca->srv_conn_list);
4102 		else
4103 			(void) rib_disconnect_channel(conn,
4104 			    &qp->hca->cl_conn_list);
4105 		return (RDMA_SUCCESS);
4106 	}
4107 	mutex_exit(&conn->c_lock);
4108 	return (RDMA_SUCCESS);
4109 }
4110 
4111 /*
4112  * Add at front of list
4113  */
4114 static struct rdma_done_list *
4115 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4116 {
4117 	struct rdma_done_list *rd;
4118 
4119 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4120 
4121 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4122 	rd->xid = xid;
4123 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4124 
4125 	rd->prev = NULL;
4126 	rd->next = qp->rdlist;
4127 	if (qp->rdlist != NULL)
4128 		qp->rdlist->prev = rd;
4129 	qp->rdlist = rd;
4130 
4131 	return (rd);
4132 }
4133 
4134 static void
4135 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4136 {
4137 	struct rdma_done_list *r;
4138 
4139 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4140 
4141 	r = rd->next;
4142 	if (r != NULL) {
4143 		r->prev = rd->prev;
4144 	}
4145 
4146 	r = rd->prev;
4147 	if (r != NULL) {
4148 		r->next = rd->next;
4149 	} else {
4150 		qp->rdlist = rd->next;
4151 	}
4152 
4153 	cv_destroy(&rd->rdma_done_cv);
4154 	kmem_free(rd, sizeof (*rd));
4155 }
4156 
4157 static void
4158 rdma_done_rem_list(rib_qp_t *qp)
4159 {
4160 	struct rdma_done_list	*r, *n;
4161 
4162 	mutex_enter(&qp->rdlist_lock);
4163 	for (r = qp->rdlist; r != NULL; r = n) {
4164 		n = r->next;
4165 		rdma_done_rm(qp, r);
4166 	}
4167 	mutex_exit(&qp->rdlist_lock);
4168 }
4169 
4170 static void
4171 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4172 {
4173 	struct rdma_done_list *r = qp->rdlist;
4174 
4175 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4176 
4177 	while (r) {
4178 		if (r->xid == xid) {
4179 			cv_signal(&r->rdma_done_cv);
4180 			return;
4181 		} else {
4182 			r = r->next;
4183 		}
4184 	}
4185 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4186 	    int, xid);
4187 }
4188 
4189 
4190 /*
4191  * Goes through all connections and closes the channel
4192  * This will cause all the WRs on those channels to be
4193  * flushed.
4194  */
4195 static void
4196 rib_close_channels(rib_conn_list_t *connlist)
4197 {
4198 	CONN 		*conn;
4199 	rib_qp_t	*qp;
4200 
4201 	rw_enter(&connlist->conn_lock, RW_READER);
4202 	conn = connlist->conn_hd;
4203 	while (conn != NULL) {
4204 		mutex_enter(&conn->c_lock);
4205 		qp = ctoqp(conn);
4206 		if (conn->c_state == C_CONNECTED) {
4207 			/*
4208 			 * Live connection in CONNECTED state.
4209 			 * Call ibt_close_rc_channel in nonblocking mode
4210 			 * with no callbacks.
4211 			 */
4212 			conn->c_state = C_ERROR_CONN;
4213 			(void) ibt_close_rc_channel(qp->qp_hdl,
4214 			    IBT_NOCALLBACKS, NULL, 0, NULL, NULL, 0);
4215 			(void) ibt_free_channel(qp->qp_hdl);
4216 			qp->qp_hdl = NULL;
4217 		} else {
4218 			if (conn->c_state == C_ERROR_CONN &&
4219 			    qp->qp_hdl != NULL) {
4220 				/*
4221 				 * Connection in ERROR state but
4222 				 * channel is not yet freed.
4223 				 */
4224 				(void) ibt_close_rc_channel(qp->qp_hdl,
4225 				    IBT_NOCALLBACKS, NULL, 0, NULL,
4226 				    NULL, 0);
4227 				(void) ibt_free_channel(qp->qp_hdl);
4228 				qp->qp_hdl = NULL;
4229 			}
4230 		}
4231 		mutex_exit(&conn->c_lock);
4232 		conn = conn->c_next;
4233 	}
4234 	rw_exit(&connlist->conn_lock);
4235 }
4236 
4237 /*
4238  * Frees up all connections that are no longer being referenced
4239  */
4240 static void
4241 rib_purge_connlist(rib_conn_list_t *connlist)
4242 {
4243 	CONN 		*conn;
4244 
4245 top:
4246 	rw_enter(&connlist->conn_lock, RW_READER);
4247 	conn = connlist->conn_hd;
4248 	while (conn != NULL) {
4249 		mutex_enter(&conn->c_lock);
4250 
4251 		/*
4252 		 * At this point connection is either in ERROR
4253 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4254 		 * then some other thread is culling that connection.
4255 		 * If not and if c_ref is 0, then destroy the connection.
4256 		 */
4257 		if (conn->c_ref == 0 &&
4258 		    conn->c_state != C_DISCONN_PEND) {
4259 			/*
4260 			 * Cull the connection
4261 			 */
4262 			conn->c_state = C_DISCONN_PEND;
4263 			mutex_exit(&conn->c_lock);
4264 			rw_exit(&connlist->conn_lock);
4265 			(void) rib_disconnect_channel(conn, connlist);
4266 			goto top;
4267 		} else {
4268 			/*
4269 			 * conn disconnect already scheduled or will
4270 			 * happen from conn_release when c_ref drops to 0.
4271 			 */
4272 			mutex_exit(&conn->c_lock);
4273 		}
4274 		conn = conn->c_next;
4275 	}
4276 	rw_exit(&connlist->conn_lock);
4277 
4278 	/*
4279 	 * At this point, only connections with c_ref != 0 are on the list
4280 	 */
4281 }
4282 
4283 /*
4284  * Cleans and closes up all uses of the HCA
4285  */
4286 static void
4287 rib_detach_hca(rib_hca_t *hca)
4288 {
4289 
4290 	/*
4291 	 * Stop all services on the HCA
4292 	 * Go through cl_conn_list and close all rc_channels
4293 	 * Go through svr_conn_list and close all rc_channels
4294 	 * Free connections whose c_ref has dropped to 0
4295 	 * Destroy all CQs
4296 	 * Deregister and released all buffer pool memory after all
4297 	 * connections are destroyed
4298 	 * Free the protection domain
4299 	 * ibt_close_hca()
4300 	 */
4301 	rw_enter(&hca->state_lock, RW_WRITER);
4302 	if (hca->state == HCA_DETACHED) {
4303 		rw_exit(&hca->state_lock);
4304 		return;
4305 	}
4306 
4307 	hca->state = HCA_DETACHED;
4308 	rib_stat->nhca_inited--;
4309 
4310 	rib_stop_services(hca);
4311 	rib_close_channels(&hca->cl_conn_list);
4312 	rib_close_channels(&hca->srv_conn_list);
4313 	rw_exit(&hca->state_lock);
4314 
4315 	rib_purge_connlist(&hca->cl_conn_list);
4316 	rib_purge_connlist(&hca->srv_conn_list);
4317 
4318 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4319 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4320 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4321 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4322 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4323 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4324 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4325 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4326 
4327 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4328 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4329 	if (hca->srv_conn_list.conn_hd == NULL &&
4330 	    hca->cl_conn_list.conn_hd == NULL) {
4331 		/*
4332 		 * conn_lists are NULL, so destroy
4333 		 * buffers, close hca and be done.
4334 		 */
4335 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4336 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4337 		rib_destroy_cache(hca);
4338 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4339 		(void) ibt_close_hca(hca->hca_hdl);
4340 		hca->hca_hdl = NULL;
4341 	}
4342 	rw_exit(&hca->cl_conn_list.conn_lock);
4343 	rw_exit(&hca->srv_conn_list.conn_lock);
4344 
4345 	if (hca->hca_hdl != NULL) {
4346 		mutex_enter(&hca->inuse_lock);
4347 		while (hca->inuse)
4348 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4349 		mutex_exit(&hca->inuse_lock);
4350 		/*
4351 		 * conn_lists are now NULL, so destroy
4352 		 * buffers, close hca and be done.
4353 		 */
4354 		rib_rbufpool_destroy(hca, RECV_BUFFER);
4355 		rib_rbufpool_destroy(hca, SEND_BUFFER);
4356 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4357 		(void) ibt_close_hca(hca->hca_hdl);
4358 		hca->hca_hdl = NULL;
4359 	}
4360 }
4361 
4362 static void
4363 rib_server_side_cache_reclaim(void *argp)
4364 {
4365 	cache_avl_struct_t    *rcas;
4366 	rib_lrc_entry_t		*rb;
4367 	rib_hca_t *hca = (rib_hca_t *)argp;
4368 
4369 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4370 	rcas = avl_first(&hca->avl_tree);
4371 	if (rcas != NULL)
4372 		avl_remove(&hca->avl_tree, rcas);
4373 
4374 	while (rcas != NULL) {
4375 		while (rcas->r.forw != &rcas->r) {
4376 			rcas->elements--;
4377 			rib_total_buffers --;
4378 			rb = rcas->r.forw;
4379 			remque(rb);
4380 			if (rb->registered)
4381 				(void) rib_deregistermem_via_hca(hca,
4382 				    rb->lrc_buf, rb->lrc_mhandle);
4383 			cache_allocation -= rb->lrc_len;
4384 			kmem_free(rb->lrc_buf, rb->lrc_len);
4385 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4386 		}
4387 		mutex_destroy(&rcas->node_lock);
4388 		kmem_cache_free(hca->server_side_cache, rcas);
4389 		rcas = avl_first(&hca->avl_tree);
4390 		if (rcas != NULL)
4391 			avl_remove(&hca->avl_tree, rcas);
4392 	}
4393 	rw_exit(&hca->avl_rw_lock);
4394 }
4395 
4396 static void
4397 rib_server_side_cache_cleanup(void *argp)
4398 {
4399 	cache_avl_struct_t    *rcas;
4400 	rib_lrc_entry_t		*rb;
4401 	rib_hca_t *hca = (rib_hca_t *)argp;
4402 
4403 	rw_enter(&hca->avl_rw_lock, RW_READER);
4404 	if (cache_allocation < cache_limit) {
4405 		rw_exit(&hca->avl_rw_lock);
4406 		return;
4407 	}
4408 	rw_exit(&hca->avl_rw_lock);
4409 
4410 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4411 	rcas = avl_last(&hca->avl_tree);
4412 	if (rcas != NULL)
4413 		avl_remove(&hca->avl_tree, rcas);
4414 
4415 	while (rcas != NULL) {
4416 		while (rcas->r.forw != &rcas->r) {
4417 			rcas->elements--;
4418 			rib_total_buffers --;
4419 			rb = rcas->r.forw;
4420 			remque(rb);
4421 			if (rb->registered)
4422 				(void) rib_deregistermem_via_hca(hca,
4423 				    rb->lrc_buf, rb->lrc_mhandle);
4424 			cache_allocation -= rb->lrc_len;
4425 			kmem_free(rb->lrc_buf, rb->lrc_len);
4426 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4427 		}
4428 		mutex_destroy(&rcas->node_lock);
4429 		kmem_cache_free(hca->server_side_cache, rcas);
4430 		if ((cache_allocation) < cache_limit) {
4431 			rw_exit(&hca->avl_rw_lock);
4432 			return;
4433 		}
4434 
4435 		rcas = avl_last(&hca->avl_tree);
4436 		if (rcas != NULL)
4437 			avl_remove(&hca->avl_tree, rcas);
4438 	}
4439 	rw_exit(&hca->avl_rw_lock);
4440 }
4441 
4442 static int
4443 avl_compare(const void *t1, const void *t2)
4444 {
4445 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4446 		return (0);
4447 
4448 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4449 		return (-1);
4450 
4451 	return (1);
4452 }
4453 
4454 static void
4455 rib_destroy_cache(rib_hca_t *hca)
4456 {
4457 	if (hca->reg_cache_clean_up != NULL) {
4458 		ddi_taskq_destroy(hca->reg_cache_clean_up);
4459 		hca->reg_cache_clean_up = NULL;
4460 	}
4461 	if (!hca->avl_init) {
4462 		kmem_cache_destroy(hca->server_side_cache);
4463 		avl_destroy(&hca->avl_tree);
4464 		mutex_destroy(&hca->cache_allocation);
4465 		rw_destroy(&hca->avl_rw_lock);
4466 	}
4467 	hca->avl_init = FALSE;
4468 }
4469 
4470 static void
4471 rib_force_cleanup(void *hca)
4472 {
4473 	if (((rib_hca_t *)hca)->reg_cache_clean_up != NULL)
4474 		(void) ddi_taskq_dispatch(
4475 		    ((rib_hca_t *)hca)->reg_cache_clean_up,
4476 		    rib_server_side_cache_cleanup,
4477 		    (void *)hca, DDI_NOSLEEP);
4478 }
4479 
4480 static rib_lrc_entry_t *
4481 rib_get_cache_buf(CONN *conn, uint32_t len)
4482 {
4483 	cache_avl_struct_t	cas, *rcas;
4484 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4485 	rib_lrc_entry_t *reply_buf;
4486 	avl_index_t where = NULL;
4487 	uint64_t c_alloc = 0;
4488 
4489 	if (!hca->avl_init)
4490 		goto  error_alloc;
4491 
4492 	cas.len = len;
4493 
4494 	rw_enter(&hca->avl_rw_lock, RW_READER);
4495 
4496 	mutex_enter(&hca->cache_allocation);
4497 	c_alloc = cache_allocation;
4498 	mutex_exit(&hca->cache_allocation);
4499 
4500 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
4501 	    &where)) == NULL) {
4502 		/* Am I above the cache limit */
4503 		if ((c_alloc + len) >= cache_limit) {
4504 			rib_force_cleanup((void *)hca);
4505 			rw_exit(&hca->avl_rw_lock);
4506 			cache_misses_above_the_limit ++;
4507 
4508 			/* Allocate and register the buffer directly */
4509 			goto error_alloc;
4510 		}
4511 
4512 		rw_exit(&hca->avl_rw_lock);
4513 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
4514 
4515 		/* Recheck to make sure no other thread added the entry in */
4516 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
4517 		    &cas, &where)) == NULL) {
4518 			/* Allocate an avl tree entry */
4519 			rcas = (cache_avl_struct_t *)
4520 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
4521 
4522 			bzero(rcas, sizeof (cache_avl_struct_t));
4523 			rcas->elements = 0;
4524 			rcas->r.forw = &rcas->r;
4525 			rcas->r.back = &rcas->r;
4526 			rcas->len = len;
4527 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
4528 			avl_insert(&hca->avl_tree, rcas, where);
4529 		}
4530 	}
4531 
4532 	mutex_enter(&rcas->node_lock);
4533 
4534 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
4535 		rib_total_buffers--;
4536 		cache_hits++;
4537 		reply_buf = rcas->r.forw;
4538 		remque(reply_buf);
4539 		rcas->elements--;
4540 		mutex_exit(&rcas->node_lock);
4541 		rw_exit(&hca->avl_rw_lock);
4542 		mutex_enter(&hca->cache_allocation);
4543 		cache_allocation -= len;
4544 		mutex_exit(&hca->cache_allocation);
4545 	} else {
4546 		/* Am I above the cache limit */
4547 		mutex_exit(&rcas->node_lock);
4548 		if ((c_alloc + len) >= cache_limit) {
4549 			rib_force_cleanup((void *)hca);
4550 			rw_exit(&hca->avl_rw_lock);
4551 			cache_misses_above_the_limit ++;
4552 			/* Allocate and register the buffer directly */
4553 			goto error_alloc;
4554 		}
4555 		rw_exit(&hca->avl_rw_lock);
4556 		cache_misses ++;
4557 		/* Allocate a reply_buf entry */
4558 		reply_buf = (rib_lrc_entry_t *)
4559 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4560 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
4561 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
4562 		reply_buf->lrc_len  = len;
4563 		reply_buf->registered = FALSE;
4564 		reply_buf->avl_node = (void *)rcas;
4565 	}
4566 
4567 	return (reply_buf);
4568 
4569 error_alloc:
4570 	reply_buf = (rib_lrc_entry_t *)
4571 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4572 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
4573 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
4574 	reply_buf->lrc_len = len;
4575 	reply_buf->registered = FALSE;
4576 	reply_buf->avl_node = NULL;
4577 
4578 	return (reply_buf);
4579 }
4580 
4581 /*
4582  * Return a pre-registered back to the cache (without
4583  * unregistering the buffer)..
4584  */
4585 
4586 static void
4587 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
4588 {
4589 	cache_avl_struct_t    cas, *rcas;
4590 	avl_index_t where = NULL;
4591 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4592 
4593 	if (!hca->avl_init)
4594 		goto  error_free;
4595 
4596 	cas.len = reg_buf->lrc_len;
4597 	rw_enter(&hca->avl_rw_lock, RW_READER);
4598 	if ((rcas = (cache_avl_struct_t *)
4599 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
4600 		rw_exit(&hca->avl_rw_lock);
4601 		goto error_free;
4602 	} else {
4603 		rib_total_buffers ++;
4604 		cas.len = reg_buf->lrc_len;
4605 		mutex_enter(&rcas->node_lock);
4606 		insque(reg_buf, &rcas->r);
4607 		rcas->elements ++;
4608 		mutex_exit(&rcas->node_lock);
4609 		rw_exit(&hca->avl_rw_lock);
4610 		mutex_enter(&hca->cache_allocation);
4611 		cache_allocation += cas.len;
4612 		mutex_exit(&hca->cache_allocation);
4613 	}
4614 
4615 	return;
4616 
4617 error_free:
4618 
4619 	if (reg_buf->registered)
4620 		(void) rib_deregistermem_via_hca(hca,
4621 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
4622 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
4623 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
4624 }
4625 
4626 static rdma_stat
4627 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
4628 	uint_t buflen, struct mrc *buf_handle)
4629 {
4630 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
4631 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
4632 	rdma_stat	status;
4633 
4634 
4635 	/*
4636 	 * Note: ALL buffer pools use the same memory type RDMARW.
4637 	 */
4638 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
4639 	if (status == RDMA_SUCCESS) {
4640 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
4641 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
4642 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
4643 	} else {
4644 		buf_handle->mrc_linfo = NULL;
4645 		buf_handle->mrc_lmr = 0;
4646 		buf_handle->mrc_rmr = 0;
4647 	}
4648 	return (status);
4649 }
4650 
4651 /* ARGSUSED */
4652 static rdma_stat
4653 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
4654     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
4655 {
4656 
4657 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
4658 	return (RDMA_SUCCESS);
4659 }
4660 
4661 /* ARGSUSED */
4662 static rdma_stat
4663 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
4664 {
4665 
4666 	(void) ibt_deregister_mr(hca->hca_hdl,
4667 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
4668 	return (RDMA_SUCCESS);
4669 }
4670 
4671 
4672 /*
4673  * Return 0 if the interface is IB.
4674  * Return error (>0) if any error is encountered during processing.
4675  * Return -1 if the interface is not IB and no error.
4676  */
4677 #define	isalpha(ch)	(((ch) >= 'a' && (ch) <= 'z') || \
4678 			((ch) >= 'A' && (ch) <= 'Z'))
4679 static int
4680 rpcib_is_ib_interface(char *name)
4681 {
4682 
4683 	char	dev_path[MAXPATHLEN];
4684 	char	devname[MAXNAMELEN];
4685 	ldi_handle_t	lh;
4686 	dl_info_ack_t	info;
4687 	int	ret = 0;
4688 	int	i;
4689 
4690 	/*
4691 	 * ibd devices are only style 2 devices
4692 	 * so we will open only style 2 devices
4693 	 * by ignoring the ppa
4694 	 */
4695 
4696 	i = strlen(name) - 1;
4697 	while ((i >= 0) && (!isalpha(name[i]))) i--;
4698 
4699 	if (i < 0) {
4700 		/* Invalid interface name, no alphabet */
4701 		return (-1);
4702 	}
4703 
4704 	(void) strncpy(devname, name, i + 1);
4705 	devname[i + 1] = '\0';
4706 
4707 	if (strcmp("lo", devname) == 0) {
4708 		/*
4709 		 * loopback interface  not rpc/rdma capable
4710 		 */
4711 		return (-1);
4712 	}
4713 
4714 	(void) strncpy(dev_path, "/dev/", MAXPATHLEN);
4715 	if (strlcat(dev_path, devname, MAXPATHLEN) >= MAXPATHLEN) {
4716 		/* string overflow */
4717 		return (-1);
4718 	}
4719 
4720 	ret = ldi_open_by_name(dev_path, FREAD|FWRITE, kcred, &lh, rpcib_li);
4721 	if (ret != 0) {
4722 		return (ret);
4723 	}
4724 	ret = rpcib_dl_info(lh, &info);
4725 	(void) ldi_close(lh, FREAD|FWRITE, kcred);
4726 	if (ret != 0) {
4727 		return (ret);
4728 	}
4729 
4730 	if (info.dl_mac_type != DL_IB) {
4731 		return (-1);
4732 	}
4733 
4734 	return (0);
4735 }
4736 
4737 static int
4738 rpcib_dl_info(ldi_handle_t lh, dl_info_ack_t *info)
4739 {
4740 	dl_info_req_t *info_req;
4741 	union DL_primitives *dl_prim;
4742 	mblk_t *mp;
4743 	k_sigset_t smask;
4744 	int error;
4745 
4746 	if ((mp = allocb(sizeof (dl_info_req_t), BPRI_MED)) == NULL) {
4747 		return (ENOMEM);
4748 	}
4749 
4750 	mp->b_datap->db_type = M_PROTO;
4751 
4752 	info_req = (dl_info_req_t *)(uintptr_t)mp->b_wptr;
4753 	mp->b_wptr += sizeof (dl_info_req_t);
4754 	info_req->dl_primitive = DL_INFO_REQ;
4755 
4756 	sigintr(&smask, 0);
4757 	if ((error = ldi_putmsg(lh, mp)) != 0) {
4758 		sigunintr(&smask);
4759 		return (error);
4760 	}
4761 	if ((error = ldi_getmsg(lh, &mp, (timestruc_t *)NULL)) != 0) {
4762 		sigunintr(&smask);
4763 		return (error);
4764 	}
4765 	sigunintr(&smask);
4766 
4767 	dl_prim = (union DL_primitives *)(uintptr_t)mp->b_rptr;
4768 	switch (dl_prim->dl_primitive) {
4769 		case DL_INFO_ACK:
4770 			if (((uintptr_t)mp->b_wptr - (uintptr_t)mp->b_rptr) <
4771 			    sizeof (dl_info_ack_t)) {
4772 			error = -1;
4773 			} else {
4774 				*info = *(dl_info_ack_t *)(uintptr_t)mp->b_rptr;
4775 				error = 0;
4776 			}
4777 			break;
4778 		default:
4779 			error = -1;
4780 			break;
4781 	}
4782 
4783 	freemsg(mp);
4784 	return (error);
4785 }
4786 static int
4787 rpcib_do_ip_ioctl(int cmd, int len, caddr_t arg)
4788 {
4789 	vnode_t *kvp, *vp;
4790 	TIUSER  *tiptr;
4791 	struct  strioctl iocb;
4792 	k_sigset_t smask;
4793 	int	err = 0;
4794 
4795 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP,
4796 	    &kvp) == 0) {
4797 		if (t_kopen((file_t *)NULL, kvp->v_rdev, FREAD|FWRITE,
4798 		    &tiptr, CRED()) == 0) {
4799 		vp = tiptr->fp->f_vnode;
4800 	} else {
4801 		VN_RELE(kvp);
4802 		return (EPROTO);
4803 		}
4804 	} else {
4805 			return (EPROTO);
4806 	}
4807 
4808 	iocb.ic_cmd = cmd;
4809 	iocb.ic_timout = 0;
4810 	iocb.ic_len = len;
4811 	iocb.ic_dp = arg;
4812 	sigintr(&smask, 0);
4813 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
4814 	sigunintr(&smask);
4815 	(void) t_kclose(tiptr, 0);
4816 	VN_RELE(kvp);
4817 	return (err);
4818 }
4819 
4820 static uint_t rpcib_get_number_interfaces(void) {
4821 uint_t	numifs;
4822 	if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (uint_t), (caddr_t)&numifs)) {
4823 		return (0);
4824 	}
4825 	return (numifs);
4826 }
4827 
4828 static boolean_t
4829 rpcib_get_ib_addresses(
4830 	struct sockaddr_in *saddr4,
4831 	struct sockaddr_in6 *saddr6,
4832 	uint_t *number4,
4833 	uint_t *number6)
4834 {
4835 	int	numifs;
4836 	struct	ifconf	kifc;
4837 	struct  ifreq *ifr;
4838 	boolean_t ret = B_FALSE;
4839 
4840 	*number4 = 0;
4841 	*number6 = 0;
4842 
4843 	if (rpcib_do_ip_ioctl(SIOCGIFNUM, sizeof (int), (caddr_t)&numifs)) {
4844 		return (ret);
4845 	}
4846 
4847 	kifc.ifc_len = numifs * sizeof (struct ifreq);
4848 	kifc.ifc_buf = kmem_zalloc(kifc.ifc_len, KM_SLEEP);
4849 
4850 	if (rpcib_do_ip_ioctl(SIOCGIFCONF, sizeof (struct ifconf),
4851 	    (caddr_t)&kifc)) {
4852 		goto done;
4853 	}
4854 
4855 	ifr = kifc.ifc_req;
4856 	for (numifs = kifc.ifc_len / sizeof (struct ifreq);
4857 	    numifs > 0; numifs--, ifr++) {
4858 		struct sockaddr_in *sin4;
4859 		struct sockaddr_in6 *sin6;
4860 
4861 		if ((rpcib_is_ib_interface(ifr->ifr_name) == 0)) {
4862 			sin4 = (struct sockaddr_in *)(uintptr_t)&ifr->ifr_addr;
4863 			sin6 = (struct sockaddr_in6 *)(uintptr_t)&ifr->ifr_addr;
4864 			if (sin4->sin_family == AF_INET) {
4865 				saddr4[*number4] = *(struct sockaddr_in *)
4866 				    (uintptr_t)&ifr->ifr_addr;
4867 				*number4 = *number4 + 1;
4868 			} else if (sin6->sin6_family == AF_INET6) {
4869 				saddr6[*number6] = *(struct sockaddr_in6 *)
4870 				    (uintptr_t)&ifr->ifr_addr;
4871 				*number6 = *number6 + 1;
4872 			}
4873 		}
4874 	}
4875 	ret = B_TRUE;
4876 done:
4877 	kmem_free(kifc.ifc_buf, kifc.ifc_len);
4878 	return (ret);
4879 }
4880 
4881 /* ARGSUSED */
4882 static int rpcib_cache_kstat_update(kstat_t *ksp, int rw) {
4883 
4884 	if (KSTAT_WRITE == rw) {
4885 		return (EACCES);
4886 	}
4887 	rpcib_kstat.cache_limit.value.ui64 =
4888 	    (uint64_t)cache_limit;
4889 	rpcib_kstat.cache_allocation.value.ui64 =
4890 	    (uint64_t)cache_allocation;
4891 	rpcib_kstat.cache_hits.value.ui64 =
4892 	    (uint64_t)cache_hits;
4893 	rpcib_kstat.cache_misses.value.ui64 =
4894 	    (uint64_t)cache_misses;
4895 	rpcib_kstat.cache_misses_above_the_limit.value.ui64 =
4896 	    (uint64_t)cache_misses_above_the_limit;
4897 	return (0);
4898 }
4899