xref: /illumos-gate/usr/src/uts/common/rpc/rpcib.c (revision 129b3e6c5b0ac55b5021a4c38db6387b6acdaaf1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developors:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * The rpcib plugin. Implements the interface for RDMATF's
42  * interaction with IBTF.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84 
85 #define	NFS_RDMA_PORT	20049
86 
87 
88 /*
89  * Convenience structures for connection management
90  */
91 typedef struct rpcib_ipaddrs {
92 	void	*ri_list;	/* pointer to list of addresses */
93 	uint_t	ri_count;	/* number of addresses in list */
94 	uint_t	ri_size;	/* size of ri_list in bytes */
95 } rpcib_ipaddrs_t;
96 
97 
98 typedef struct rpcib_ping {
99 	rib_hca_t  *hca;
100 	ibt_path_info_t path;
101 	ibt_ip_addr_t srcip;
102 	ibt_ip_addr_t dstip;
103 } rpcib_ping_t;
104 
105 /*
106  * Prototype declarations for driver ops
107  */
108 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 				void *, void **);
111 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 static int	rpcib_do_ip_ioctl(int, int, void *);
114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 static int rpcib_cache_kstat_update(kstat_t *, int);
116 static void rib_force_cleanup(void *);
117 static void rib_stop_hca_services(rib_hca_t *);
118 static void rib_attach_hca(void);
119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
120 		struct netbuf *d_svcaddr, CONN **conn);
121 
122 struct {
123 	kstat_named_t cache_limit;
124 	kstat_named_t cache_allocation;
125 	kstat_named_t cache_hits;
126 	kstat_named_t cache_misses;
127 	kstat_named_t cache_misses_above_the_limit;
128 } rpcib_kstat = {
129 	{"cache_limit",			KSTAT_DATA_UINT64 },
130 	{"cache_allocation",		KSTAT_DATA_UINT64 },
131 	{"cache_hits",			KSTAT_DATA_UINT64 },
132 	{"cache_misses",		KSTAT_DATA_UINT64 },
133 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
134 };
135 
136 /* rpcib cb_ops */
137 static struct cb_ops rpcib_cbops = {
138 	nulldev,		/* open */
139 	nulldev,		/* close */
140 	nodev,			/* strategy */
141 	nodev,			/* print */
142 	nodev,			/* dump */
143 	nodev,			/* read */
144 	nodev,			/* write */
145 	nodev,			/* ioctl */
146 	nodev,			/* devmap */
147 	nodev,			/* mmap */
148 	nodev,			/* segmap */
149 	nochpoll,		/* poll */
150 	ddi_prop_op,		/* prop_op */
151 	NULL,			/* stream */
152 	D_MP,			/* cb_flag */
153 	CB_REV,			/* rev */
154 	nodev,			/* int (*cb_aread)() */
155 	nodev			/* int (*cb_awrite)() */
156 };
157 
158 /*
159  * Device options
160  */
161 static struct dev_ops rpcib_ops = {
162 	DEVO_REV,		/* devo_rev, */
163 	0,			/* refcnt  */
164 	rpcib_getinfo,		/* info */
165 	nulldev,		/* identify */
166 	nulldev,		/* probe */
167 	rpcib_attach,		/* attach */
168 	rpcib_detach,		/* detach */
169 	nodev,			/* reset */
170 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
171 	NULL,			/* bus operations */
172 	NULL,			/* power */
173 	ddi_quiesce_not_needed,		/* quiesce */
174 };
175 
176 /*
177  * Module linkage information.
178  */
179 
180 static struct modldrv rib_modldrv = {
181 	&mod_driverops,		/* Driver module */
182 	"RPCIB plugin driver",	/* Driver name and version */
183 	&rpcib_ops,		/* Driver ops */
184 };
185 
186 static struct modlinkage rib_modlinkage = {
187 	MODREV_1,
188 	(void *)&rib_modldrv,
189 	NULL
190 };
191 
192 typedef struct rib_lrc_entry {
193 	struct rib_lrc_entry *forw;
194 	struct rib_lrc_entry *back;
195 	char *lrc_buf;
196 
197 	uint32_t lrc_len;
198 	void  *avl_node;
199 	bool_t registered;
200 
201 	struct mrc lrc_mhandle;
202 	bool_t lrc_on_freed_list;
203 } rib_lrc_entry_t;
204 
205 typedef	struct cache_struct	{
206 	rib_lrc_entry_t		r;
207 	uint32_t		len;
208 	uint32_t		elements;
209 	kmutex_t		node_lock;
210 	avl_node_t		avl_link;
211 } cache_avl_struct_t;
212 
213 uint64_t	cache_limit = 100 * 1024 * 1024;
214 static uint64_t	cache_watermark = 80 * 1024 * 1024;
215 static bool_t	stats_enabled = FALSE;
216 
217 static uint64_t max_unsignaled_rws = 5;
218 int nfs_rdma_port = NFS_RDMA_PORT;
219 
220 /*
221  * rib_stat: private data pointer used when registering
222  *	with the IBTF.  It is returned to the consumer
223  *	in all callbacks.
224  */
225 static rpcib_state_t *rib_stat = NULL;
226 
227 #define	RNR_RETRIES	IBT_RNR_RETRY_1
228 #define	MAX_PORTS	2
229 #define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
230 #define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */
231 
232 int preposted_rbufs = RDMA_BUFS_GRANT;
233 int send_threshold = 1;
234 
235 /*
236  * Old cards with Tavor driver have limited memory footprint
237  * when booted in 32bit. The rib_max_rbufs tunable can be
238  * tuned for more buffers if needed.
239  */
240 
241 #if !defined(_ELF64) && !defined(__sparc)
242 int rib_max_rbufs = MAX_BUFS;
243 #else
244 int rib_max_rbufs = 10 * MAX_BUFS;
245 #endif	/* !(_ELF64) && !(__sparc) */
246 
247 int rib_conn_timeout = 60 * 12;		/* 12 minutes */
248 
249 /*
250  * State of the plugin.
251  * ACCEPT = accepting new connections and requests.
252  * NO_ACCEPT = not accepting new connection and requests.
253  * This should eventually move to rpcib_state_t structure, since this
254  * will tell in which state the plugin is for a particular type of service
255  * like NFS, NLM or v4 Callback deamon. The plugin might be in accept
256  * state for one and in no_accept state for the other.
257  */
258 int		plugin_state;
259 kmutex_t	plugin_state_lock;
260 
261 ldi_ident_t rpcib_li;
262 
263 /*
264  * RPCIB RDMATF operations
265  */
266 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
267 static rdma_stat rib_disconnect(CONN *conn);
268 static void rib_listen(struct rdma_svc_data *rd);
269 static void rib_listen_stop(struct rdma_svc_data *rd);
270 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
271 	uint_t buflen, struct mrc *buf_handle);
272 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
273 	struct mrc buf_handle);
274 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
275 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
276 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
277 		struct mrc buf_handle);
278 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
279 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
280 	void *lrc);
281 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
282 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
283 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
284 	caddr_t buf, int len, int cpu);
285 
286 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
287 
288 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
289 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
290 
291 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
292 
293 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
294 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
295 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
296 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
297 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
298 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
299 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
300 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
301 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
302 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
303 	int addr_type, void *, CONN **);
304 static rdma_stat rib_conn_release(CONN *conn);
305 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
306 	rpcib_ping_t *, CONN **);
307 static rdma_stat rib_getinfo(rdma_info_t *info);
308 
309 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
310 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
311 static void rib_destroy_cache(rib_hca_t *hca);
312 static	void	rib_server_side_cache_reclaim(void *argp);
313 static int avl_compare(const void *t1, const void *t2);
314 
315 static void rib_stop_services(rib_hca_t *);
316 static void rib_close_channels(rib_conn_list_t *);
317 static void rib_conn_close(void *);
318 
319 /*
320  * RPCIB addressing operations
321  */
322 
323 /*
324  * RDMA operations the RPCIB module exports
325  */
326 static rdmaops_t rib_ops = {
327 	rib_reachable,
328 	rib_conn_get,
329 	rib_conn_release,
330 	rib_listen,
331 	rib_listen_stop,
332 	rib_registermem,
333 	rib_deregistermem,
334 	rib_registermemsync,
335 	rib_deregistermemsync,
336 	rib_syncmem,
337 	rib_reg_buf_alloc,
338 	rib_reg_buf_free,
339 	rib_send,
340 	rib_send_resp,
341 	rib_post_resp,
342 	rib_post_resp_remove,
343 	rib_post_recv,
344 	rib_recv,
345 	rib_read,
346 	rib_write,
347 	rib_getinfo,
348 };
349 
350 /*
351  * RDMATF RPCIB plugin details
352  */
353 static rdma_mod_t rib_mod = {
354 	"ibtf",		/* api name */
355 	RDMATF_VERS_1,
356 	0,
357 	&rib_ops,	/* rdma op vector for ibtf */
358 };
359 
360 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
361 static rdma_stat rib_qp_init(rib_qp_t *, int);
362 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
363 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
364 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
365 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
366 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
367 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
368 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
369 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
370 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
371 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
372 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
373 	rib_qp_t **);
374 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
375 	rib_qp_t **);
376 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
377 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
378 static int rib_free_sendwait(struct send_wid *);
379 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
380 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
381 static void rdma_done_rem_list(rib_qp_t *);
382 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
383 
384 static void rib_async_handler(void *,
385 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
386 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
387 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
388 static int rib_free_svc_recv(struct svc_recv *);
389 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
390 static void rib_free_wid(struct recv_wid *);
391 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
392 static void rib_detach_hca(rib_hca_t *);
393 static void rib_close_a_channel(CONN *);
394 static void rib_send_hold(rib_qp_t *);
395 static void rib_send_rele(rib_qp_t *);
396 
397 /*
398  * Registration with IBTF as a consumer
399  */
400 static struct ibt_clnt_modinfo_s rib_modinfo = {
401 	IBTI_V_CURR,
402 	IBT_GENERIC,
403 	rib_async_handler,	/* async event handler */
404 	NULL,			/* Memory Region Handler */
405 	"nfs/ib"
406 };
407 
408 /*
409  * Global strucuture
410  */
411 
412 typedef struct rpcib_s {
413 	dev_info_t	*rpcib_dip;
414 	kmutex_t	rpcib_mutex;
415 } rpcib_t;
416 
417 rpcib_t rpcib;
418 
419 /*
420  * /etc/system controlled variable to control
421  * debugging in rpcib kernel module.
422  * Set it to values greater that 1 to control
423  * the amount of debugging messages required.
424  */
425 int rib_debug = 0;
426 
427 int
428 _init(void)
429 {
430 	int error;
431 
432 	error = mod_install((struct modlinkage *)&rib_modlinkage);
433 	if (error != 0) {
434 		/*
435 		 * Could not load module
436 		 */
437 		return (error);
438 	}
439 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
440 	return (0);
441 }
442 
443 int
444 _fini()
445 {
446 	int status;
447 
448 	/*
449 	 * Remove module
450 	 */
451 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
452 		return (status);
453 	}
454 	mutex_destroy(&plugin_state_lock);
455 	return (0);
456 }
457 
458 int
459 _info(struct modinfo *modinfop)
460 {
461 	return (mod_info(&rib_modlinkage, modinfop));
462 }
463 
464 /*
465  * rpcib_getinfo()
466  * Given the device number, return the devinfo pointer or the
467  * instance number.
468  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
469  */
470 
471 /*ARGSUSED*/
472 static int
473 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
474 {
475 	int ret = DDI_SUCCESS;
476 
477 	switch (cmd) {
478 	case DDI_INFO_DEVT2DEVINFO:
479 		if (rpcib.rpcib_dip != NULL)
480 			*result = rpcib.rpcib_dip;
481 		else {
482 			*result = NULL;
483 			ret = DDI_FAILURE;
484 		}
485 		break;
486 
487 	case DDI_INFO_DEVT2INSTANCE:
488 		*result = NULL;
489 		break;
490 
491 	default:
492 		ret = DDI_FAILURE;
493 	}
494 	return (ret);
495 }
496 
497 static void
498 rpcib_free_hca_list()
499 {
500 	rib_hca_t *hca, *hcap;
501 
502 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
503 	hca = rib_stat->hcas_list;
504 	rib_stat->hcas_list = NULL;
505 	rw_exit(&rib_stat->hcas_list_lock);
506 	while (hca != NULL) {
507 		rw_enter(&hca->state_lock, RW_WRITER);
508 		hcap = hca;
509 		hca = hca->next;
510 		rib_stat->nhca_inited--;
511 		rib_mod.rdma_count--;
512 		hcap->state = HCA_DETACHED;
513 		rw_exit(&hcap->state_lock);
514 		rib_stop_hca_services(hcap);
515 
516 		kmem_free(hcap, sizeof (*hcap));
517 	}
518 }
519 
520 static rdma_stat
521 rpcib_free_service_list()
522 {
523 	rib_service_t *service;
524 	ibt_status_t ret;
525 
526 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
527 	while (rib_stat->service_list != NULL) {
528 		service = rib_stat->service_list;
529 		ret = ibt_unbind_all_services(service->srv_hdl);
530 		if (ret != IBT_SUCCESS) {
531 			rw_exit(&rib_stat->service_list_lock);
532 #ifdef DEBUG
533 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
534 			    "ibt_unbind_all_services failed (%d)\n", (int)ret);
535 #endif
536 			return (RDMA_FAILED);
537 		}
538 		ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
539 		    service->srv_hdl);
540 		if (ret != IBT_SUCCESS) {
541 			rw_exit(&rib_stat->service_list_lock);
542 #ifdef DEBUG
543 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
544 			    "ibt_deregister_service failed (%d)\n", (int)ret);
545 #endif
546 			return (RDMA_FAILED);
547 		}
548 		rib_stat->service_list = service->next;
549 		kmem_free(service, sizeof (rib_service_t));
550 	}
551 	rw_exit(&rib_stat->service_list_lock);
552 
553 	return (RDMA_SUCCESS);
554 }
555 
556 static int
557 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
558 {
559 	ibt_status_t	ibt_status;
560 	rdma_stat	r_status;
561 
562 	switch (cmd) {
563 	case DDI_ATTACH:
564 		break;
565 	case DDI_RESUME:
566 		return (DDI_SUCCESS);
567 	default:
568 		return (DDI_FAILURE);
569 	}
570 
571 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
572 
573 	mutex_enter(&rpcib.rpcib_mutex);
574 	if (rpcib.rpcib_dip != NULL) {
575 		mutex_exit(&rpcib.rpcib_mutex);
576 		return (DDI_FAILURE);
577 	}
578 	rpcib.rpcib_dip = dip;
579 	mutex_exit(&rpcib.rpcib_mutex);
580 	/*
581 	 * Create the "rpcib" minor-node.
582 	 */
583 	if (ddi_create_minor_node(dip,
584 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
585 		/* Error message, no cmn_err as they print on console */
586 		return (DDI_FAILURE);
587 	}
588 
589 	if (rib_stat == NULL) {
590 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
591 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
592 		rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
593 		mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
594 	}
595 
596 	rib_stat->hca_count = ibt_get_hca_list(NULL);
597 	if (rib_stat->hca_count < 1) {
598 		mutex_destroy(&rib_stat->listen_lock);
599 		rw_destroy(&rib_stat->hcas_list_lock);
600 		mutex_destroy(&rib_stat->open_hca_lock);
601 		kmem_free(rib_stat, sizeof (*rib_stat));
602 		rib_stat = NULL;
603 		return (DDI_FAILURE);
604 	}
605 
606 	ibt_status = ibt_attach(&rib_modinfo, dip,
607 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
608 
609 	if (ibt_status != IBT_SUCCESS) {
610 		mutex_destroy(&rib_stat->listen_lock);
611 		rw_destroy(&rib_stat->hcas_list_lock);
612 		mutex_destroy(&rib_stat->open_hca_lock);
613 		kmem_free(rib_stat, sizeof (*rib_stat));
614 		rib_stat = NULL;
615 		return (DDI_FAILURE);
616 	}
617 
618 	rib_stat->service_list = NULL;
619 	rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
620 	mutex_enter(&rib_stat->open_hca_lock);
621 	if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
622 		mutex_exit(&rib_stat->open_hca_lock);
623 		goto open_fail;
624 	}
625 	mutex_exit(&rib_stat->open_hca_lock);
626 
627 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
628 	    DDI_PROP_SUCCESS) {
629 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
630 		    "failed.");
631 		goto register_fail;
632 	}
633 
634 	/*
635 	 * Register with rdmatf
636 	 */
637 	r_status = rdma_register_mod(&rib_mod);
638 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
639 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
640 		    "status = %d", r_status);
641 		goto register_fail;
642 	}
643 
644 	return (DDI_SUCCESS);
645 
646 register_fail:
647 
648 open_fail:
649 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
650 	rpcib_free_hca_list();
651 	(void) rpcib_free_service_list();
652 	mutex_destroy(&rib_stat->listen_lock);
653 	rw_destroy(&rib_stat->hcas_list_lock);
654 	mutex_destroy(&rib_stat->open_hca_lock);
655 	rw_destroy(&rib_stat->service_list_lock);
656 	kmem_free(rib_stat, sizeof (*rib_stat));
657 	rib_stat = NULL;
658 	return (DDI_FAILURE);
659 }
660 
661 /*ARGSUSED*/
662 static int
663 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
664 {
665 	switch (cmd) {
666 
667 	case DDI_DETACH:
668 		break;
669 
670 	case DDI_SUSPEND:
671 	default:
672 		return (DDI_FAILURE);
673 	}
674 
675 	/*
676 	 * Detach the hca and free resources
677 	 */
678 	mutex_enter(&plugin_state_lock);
679 	plugin_state = NO_ACCEPT;
680 	mutex_exit(&plugin_state_lock);
681 
682 	if (rpcib_free_service_list() != RDMA_SUCCESS)
683 		return (DDI_FAILURE);
684 	rpcib_free_hca_list();
685 
686 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
687 	mutex_destroy(&rib_stat->listen_lock);
688 	rw_destroy(&rib_stat->hcas_list_lock);
689 	mutex_destroy(&rib_stat->open_hca_lock);
690 	rw_destroy(&rib_stat->service_list_lock);
691 
692 	kmem_free(rib_stat, sizeof (*rib_stat));
693 	rib_stat = NULL;
694 
695 	mutex_enter(&rpcib.rpcib_mutex);
696 	rpcib.rpcib_dip = NULL;
697 	mutex_exit(&rpcib.rpcib_mutex);
698 	mutex_destroy(&rpcib.rpcib_mutex);
699 	return (DDI_SUCCESS);
700 }
701 
702 
703 static void rib_rbufpool_free(rib_hca_t *, int);
704 static void rib_rbufpool_deregister(rib_hca_t *, int);
705 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
706 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
707 static rdma_stat rib_rem_replylist(rib_qp_t *);
708 static int rib_remreply(rib_qp_t *, struct reply *);
709 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
710 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
711 
712 
713 /*
714  * One CQ pair per HCA
715  */
716 static rdma_stat
717 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
718 	rib_cq_t **cqp)
719 {
720 	rib_cq_t	*cq;
721 	ibt_cq_attr_t	cq_attr;
722 	uint32_t	real_size;
723 	ibt_status_t	status;
724 	rdma_stat	error = RDMA_SUCCESS;
725 
726 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
727 	cq->rib_hca = hca;
728 	cq_attr.cq_size = cq_size;
729 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
730 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
731 	    &real_size);
732 	if (status != IBT_SUCCESS) {
733 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
734 		    " status=%d", status);
735 		error = RDMA_FAILED;
736 		goto fail;
737 	}
738 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
739 
740 	/*
741 	 * Enable CQ callbacks. CQ Callbacks are single shot
742 	 * (e.g. you have to call ibt_enable_cq_notify()
743 	 * after each callback to get another one).
744 	 */
745 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
746 	if (status != IBT_SUCCESS) {
747 		cmn_err(CE_WARN, "rib_create_cq: "
748 		    "enable_cq_notify failed, status %d", status);
749 		error = RDMA_FAILED;
750 		goto fail;
751 	}
752 	*cqp = cq;
753 
754 	return (error);
755 fail:
756 	if (cq->rib_cq_hdl)
757 		(void) ibt_free_cq(cq->rib_cq_hdl);
758 	if (cq)
759 		kmem_free(cq, sizeof (rib_cq_t));
760 	return (error);
761 }
762 
763 /*
764  * rpcib_find_hca
765  *
766  * Caller should have already locked the hcas_lock before calling
767  * this function.
768  */
769 static rib_hca_t *
770 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
771 {
772 	rib_hca_t *hca = ribstat->hcas_list;
773 
774 	while (hca && hca->hca_guid != guid)
775 		hca = hca->next;
776 
777 	return (hca);
778 }
779 
780 static rdma_stat
781 rpcib_open_hcas(rpcib_state_t *ribstat)
782 {
783 	rib_hca_t		*hca;
784 	ibt_status_t		ibt_status;
785 	rdma_stat		status;
786 	ibt_hca_portinfo_t	*pinfop;
787 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
788 	uint_t			size, cq_size;
789 	int			i;
790 	kstat_t *ksp;
791 	cache_avl_struct_t example_avl_node;
792 	char rssc_name[32];
793 	int old_nhca_inited = ribstat->nhca_inited;
794 	ib_guid_t		*hca_guids;
795 
796 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
797 
798 	ribstat->hca_count = ibt_get_hca_list(&hca_guids);
799 	if (ribstat->hca_count == 0)
800 		return (RDMA_FAILED);
801 
802 	rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
803 	/*
804 	 * Open a hca and setup for RDMA
805 	 */
806 	for (i = 0; i < ribstat->hca_count; i++) {
807 		if (rpcib_find_hca(ribstat, hca_guids[i]))
808 			continue;
809 		hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
810 
811 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
812 		    hca_guids[i], &hca->hca_hdl);
813 		if (ibt_status != IBT_SUCCESS) {
814 			kmem_free(hca, sizeof (rib_hca_t));
815 			continue;
816 		}
817 		hca->hca_guid = hca_guids[i];
818 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
819 		hca->state = HCA_INITED;
820 
821 		/*
822 		 * query HCA info
823 		 */
824 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
825 		if (ibt_status != IBT_SUCCESS) {
826 			goto fail1;
827 		}
828 
829 		/*
830 		 * One PD (Protection Domain) per HCA.
831 		 * A qp is allowed to access a memory region
832 		 * only when it's in the same PD as that of
833 		 * the memory region.
834 		 */
835 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
836 		if (ibt_status != IBT_SUCCESS) {
837 			goto fail1;
838 		}
839 
840 		/*
841 		 * query HCA ports
842 		 */
843 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
844 		    0, &pinfop, &hca->hca_nports, &size);
845 		if (ibt_status != IBT_SUCCESS) {
846 			goto fail2;
847 		}
848 		hca->hca_ports = pinfop;
849 		hca->hca_pinfosz = size;
850 		pinfop = NULL;
851 
852 		cq_size = DEF_CQ_SIZE; /* default cq size */
853 		/*
854 		 * Create 2 pairs of cq's (1 pair for client
855 		 * and the other pair for server) on this hca.
856 		 * If number of qp's gets too large, then several
857 		 * cq's will be needed.
858 		 */
859 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
860 		    &hca->svc_rcq);
861 		if (status != RDMA_SUCCESS) {
862 			goto fail3;
863 		}
864 
865 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
866 		    &hca->svc_scq);
867 		if (status != RDMA_SUCCESS) {
868 			goto fail3;
869 		}
870 
871 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
872 		    &hca->clnt_rcq);
873 		if (status != RDMA_SUCCESS) {
874 			goto fail3;
875 		}
876 
877 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
878 		    &hca->clnt_scq);
879 		if (status != RDMA_SUCCESS) {
880 			goto fail3;
881 		}
882 
883 		/*
884 		 * Create buffer pools.
885 		 * Note rib_rbuf_create also allocates memory windows.
886 		 */
887 		hca->recv_pool = rib_rbufpool_create(hca,
888 		    RECV_BUFFER, rib_max_rbufs);
889 		if (hca->recv_pool == NULL) {
890 			goto fail3;
891 		}
892 
893 		hca->send_pool = rib_rbufpool_create(hca,
894 		    SEND_BUFFER, rib_max_rbufs);
895 		if (hca->send_pool == NULL) {
896 			rib_rbufpool_destroy(hca, RECV_BUFFER);
897 			goto fail3;
898 		}
899 
900 		if (hca->server_side_cache == NULL) {
901 			(void) sprintf(rssc_name,
902 			    "rib_srvr_cache_%llx",
903 			    (long long unsigned int) hca->hca_guid);
904 			hca->server_side_cache = kmem_cache_create(
905 			    rssc_name,
906 			    sizeof (cache_avl_struct_t), 0,
907 			    NULL,
908 			    NULL,
909 			    rib_server_side_cache_reclaim,
910 			    hca, NULL, 0);
911 		}
912 
913 		avl_create(&hca->avl_tree,
914 		    avl_compare,
915 		    sizeof (cache_avl_struct_t),
916 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
917 		    (uint_t)(uintptr_t)&example_avl_node);
918 
919 		rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
920 		    hca->iblock);
921 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
922 		rw_init(&hca->avl_rw_lock,
923 		    NULL, RW_DRIVER, hca->iblock);
924 		mutex_init(&hca->cache_allocation_lock,
925 		    NULL, MUTEX_DRIVER, NULL);
926 		hca->avl_init = TRUE;
927 
928 		/* Create kstats for the cache */
929 		ASSERT(INGLOBALZONE(curproc));
930 
931 		if (!stats_enabled) {
932 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
933 			    KSTAT_TYPE_NAMED,
934 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
935 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
936 			    GLOBAL_ZONEID);
937 			if (ksp) {
938 				ksp->ks_data = (void *) &rpcib_kstat;
939 				ksp->ks_update = rpcib_cache_kstat_update;
940 				kstat_install(ksp);
941 				stats_enabled = TRUE;
942 			}
943 		}
944 		if (hca->cleanup_helper == NULL) {
945 			char tq_name[sizeof (hca->hca_guid) * 2 + 1];
946 
947 			(void) snprintf(tq_name, sizeof (tq_name), "%llX",
948 			    (unsigned long long int) hca->hca_guid);
949 			hca->cleanup_helper = ddi_taskq_create(NULL,
950 			    tq_name, 1, TASKQ_DEFAULTPRI, 0);
951 		}
952 
953 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
954 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
955 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
956 		    hca->iblock);
957 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
958 		    hca->iblock);
959 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
960 		hca->inuse = TRUE;
961 
962 		hca->next = ribstat->hcas_list;
963 		ribstat->hcas_list = hca;
964 		ribstat->nhca_inited++;
965 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
966 		continue;
967 
968 fail3:
969 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
970 fail2:
971 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
972 fail1:
973 		(void) ibt_close_hca(hca->hca_hdl);
974 		kmem_free(hca, sizeof (rib_hca_t));
975 	}
976 	rw_exit(&ribstat->hcas_list_lock);
977 	ibt_free_hca_list(hca_guids, ribstat->hca_count);
978 	rib_mod.rdma_count = rib_stat->nhca_inited;
979 
980 	/*
981 	 * return success if at least one new hca has been configured.
982 	 */
983 	if (ribstat->nhca_inited != old_nhca_inited)
984 		return (RDMA_SUCCESS);
985 	else
986 		return (RDMA_FAILED);
987 }
988 
989 /*
990  * Callback routines
991  */
992 
993 /*
994  * SCQ handlers
995  */
996 /* ARGSUSED */
997 static void
998 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
999 {
1000 	ibt_status_t	ibt_status;
1001 	ibt_wc_t	wc;
1002 	struct send_wid	*wd;
1003 	CONN		*conn;
1004 	rib_qp_t	*qp;
1005 	int		i;
1006 
1007 	/*
1008 	 * Re-enable cq notify here to avoid missing any
1009 	 * completion queue notification.
1010 	 */
1011 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1012 
1013 	ibt_status = IBT_SUCCESS;
1014 	while (ibt_status != IBT_CQ_EMPTY) {
1015 		bzero(&wc, sizeof (wc));
1016 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1017 		if (ibt_status != IBT_SUCCESS)
1018 			return;
1019 
1020 		/*
1021 		 * Got a send completion
1022 		 */
1023 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1024 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1025 			qp = wd->qp;
1026 			conn = qptoc(qp);
1027 
1028 			mutex_enter(&wd->sendwait_lock);
1029 			switch (wc.wc_status) {
1030 			case IBT_WC_SUCCESS:
1031 				wd->status = RDMA_SUCCESS;
1032 				break;
1033 			default:
1034 /*
1035  *    RC Send Q Error Code		Local state     Remote State
1036  *    ==================== 		===========     ============
1037  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
1038  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
1039  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
1040  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
1041  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
1042  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
1043  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
1044  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
1045  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
1046  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
1047  *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
1048  */
1049 				/*
1050 				 * Channel in error state. Set connection to
1051 				 * ERROR and cleanup will happen either from
1052 				 * conn_release  or from rib_conn_get
1053 				 */
1054 				wd->status = RDMA_FAILED;
1055 				mutex_enter(&conn->c_lock);
1056 				if (conn->c_state != C_DISCONN_PEND)
1057 					conn->c_state = C_ERROR_CONN;
1058 				mutex_exit(&conn->c_lock);
1059 				break;
1060 			}
1061 
1062 			if (wd->cv_sig == 1) {
1063 				/*
1064 				 * Notify poster
1065 				 */
1066 				cv_signal(&wd->wait_cv);
1067 				mutex_exit(&wd->sendwait_lock);
1068 			} else {
1069 				/*
1070 				 * Poster not waiting for notification.
1071 				 * Free the send buffers and send_wid
1072 				 */
1073 				for (i = 0; i < wd->nsbufs; i++) {
1074 					rib_rbuf_free(qptoc(wd->qp),
1075 					    SEND_BUFFER,
1076 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1077 				}
1078 
1079 				/* decrement the send ref count */
1080 				rib_send_rele(qp);
1081 
1082 				mutex_exit(&wd->sendwait_lock);
1083 				(void) rib_free_sendwait(wd);
1084 			}
1085 		}
1086 	}
1087 }
1088 
1089 /* ARGSUSED */
1090 static void
1091 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1092 {
1093 	ibt_status_t	ibt_status;
1094 	ibt_wc_t	wc;
1095 	struct send_wid	*wd;
1096 	rib_qp_t	*qp;
1097 	CONN		*conn;
1098 	int		i;
1099 
1100 	/*
1101 	 * Re-enable cq notify here to avoid missing any
1102 	 * completion queue notification.
1103 	 */
1104 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1105 
1106 	ibt_status = IBT_SUCCESS;
1107 	while (ibt_status != IBT_CQ_EMPTY) {
1108 		bzero(&wc, sizeof (wc));
1109 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1110 		if (ibt_status != IBT_SUCCESS)
1111 			return;
1112 
1113 		/*
1114 		 * Got a send completion
1115 		 */
1116 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1117 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1118 			qp = wd->qp;
1119 			conn = qptoc(qp);
1120 			mutex_enter(&wd->sendwait_lock);
1121 
1122 			switch (wc.wc_status) {
1123 			case IBT_WC_SUCCESS:
1124 				wd->status = RDMA_SUCCESS;
1125 				break;
1126 			default:
1127 				/*
1128 				 * Channel in error state. Set connection to
1129 				 * ERROR and cleanup will happen either from
1130 				 * conn_release  or conn timeout.
1131 				 */
1132 				wd->status = RDMA_FAILED;
1133 				mutex_enter(&conn->c_lock);
1134 				if (conn->c_state != C_DISCONN_PEND)
1135 					conn->c_state = C_ERROR_CONN;
1136 				mutex_exit(&conn->c_lock);
1137 				break;
1138 			}
1139 
1140 			if (wd->cv_sig == 1) {
1141 				/*
1142 				 * Update completion status and notify poster
1143 				 */
1144 				cv_signal(&wd->wait_cv);
1145 				mutex_exit(&wd->sendwait_lock);
1146 			} else {
1147 				/*
1148 				 * Poster not waiting for notification.
1149 				 * Free the send buffers and send_wid
1150 				 */
1151 				for (i = 0; i < wd->nsbufs; i++) {
1152 					rib_rbuf_free(qptoc(wd->qp),
1153 					    SEND_BUFFER,
1154 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1155 				}
1156 
1157 				/* decrement the send ref count */
1158 				rib_send_rele(qp);
1159 
1160 				mutex_exit(&wd->sendwait_lock);
1161 				(void) rib_free_sendwait(wd);
1162 			}
1163 		}
1164 	}
1165 }
1166 
1167 /*
1168  * RCQ handler
1169  */
1170 /* ARGSUSED */
1171 static void
1172 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1173 {
1174 	rib_qp_t	*qp;
1175 	ibt_status_t	ibt_status;
1176 	ibt_wc_t	wc;
1177 	struct recv_wid	*rwid;
1178 
1179 	/*
1180 	 * Re-enable cq notify here to avoid missing any
1181 	 * completion queue notification.
1182 	 */
1183 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1184 
1185 	ibt_status = IBT_SUCCESS;
1186 	while (ibt_status != IBT_CQ_EMPTY) {
1187 		bzero(&wc, sizeof (wc));
1188 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1189 		if (ibt_status != IBT_SUCCESS)
1190 			return;
1191 
1192 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1193 		qp = rwid->qp;
1194 		if (wc.wc_status == IBT_WC_SUCCESS) {
1195 			XDR	inxdrs, *xdrs;
1196 			uint_t	xid, vers, op, find_xid = 0;
1197 			struct reply	*r;
1198 			CONN *conn = qptoc(qp);
1199 			uint32_t rdma_credit = 0;
1200 
1201 			xdrs = &inxdrs;
1202 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1203 			    wc.wc_bytes_xfer, XDR_DECODE);
1204 			/*
1205 			 * Treat xid as opaque (xid is the first entity
1206 			 * in the rpc rdma message).
1207 			 */
1208 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1209 
1210 			/* Skip xid and set the xdr position accordingly. */
1211 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1212 			(void) xdr_u_int(xdrs, &vers);
1213 			(void) xdr_u_int(xdrs, &rdma_credit);
1214 			(void) xdr_u_int(xdrs, &op);
1215 			XDR_DESTROY(xdrs);
1216 
1217 			if (vers != RPCRDMA_VERS) {
1218 				/*
1219 				 * Invalid RPC/RDMA version. Cannot
1220 				 * interoperate.  Set connection to
1221 				 * ERROR state and bail out.
1222 				 */
1223 				mutex_enter(&conn->c_lock);
1224 				if (conn->c_state != C_DISCONN_PEND)
1225 					conn->c_state = C_ERROR_CONN;
1226 				mutex_exit(&conn->c_lock);
1227 				rib_rbuf_free(conn, RECV_BUFFER,
1228 				    (void *)(uintptr_t)rwid->addr);
1229 				rib_free_wid(rwid);
1230 				continue;
1231 			}
1232 
1233 			mutex_enter(&qp->replylist_lock);
1234 			for (r = qp->replylist; r != NULL; r = r->next) {
1235 				if (r->xid == xid) {
1236 					find_xid = 1;
1237 					switch (op) {
1238 					case RDMA_MSG:
1239 					case RDMA_NOMSG:
1240 					case RDMA_MSGP:
1241 						r->status = RDMA_SUCCESS;
1242 						r->vaddr_cq = rwid->addr;
1243 						r->bytes_xfer =
1244 						    wc.wc_bytes_xfer;
1245 						cv_signal(&r->wait_cv);
1246 						break;
1247 					default:
1248 						rib_rbuf_free(qptoc(qp),
1249 						    RECV_BUFFER,
1250 						    (void *)(uintptr_t)
1251 						    rwid->addr);
1252 						break;
1253 					}
1254 					break;
1255 				}
1256 			}
1257 			mutex_exit(&qp->replylist_lock);
1258 			if (find_xid == 0) {
1259 				/* RPC caller not waiting for reply */
1260 
1261 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1262 				    int, xid);
1263 
1264 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1265 				    (void *)(uintptr_t)rwid->addr);
1266 			}
1267 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1268 			CONN *conn = qptoc(qp);
1269 
1270 			/*
1271 			 * Connection being flushed. Just free
1272 			 * the posted buffer
1273 			 */
1274 			rib_rbuf_free(conn, RECV_BUFFER,
1275 			    (void *)(uintptr_t)rwid->addr);
1276 		} else {
1277 			CONN *conn = qptoc(qp);
1278 /*
1279  *  RC Recv Q Error Code		Local state     Remote State
1280  *  ====================		===========     ============
1281  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1282  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1283  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1284  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1285  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1286  *  IBT_WC_WR_FLUSHED_ERR               None            None
1287  */
1288 			/*
1289 			 * Channel in error state. Set connection
1290 			 * in ERROR state.
1291 			 */
1292 			mutex_enter(&conn->c_lock);
1293 			if (conn->c_state != C_DISCONN_PEND)
1294 				conn->c_state = C_ERROR_CONN;
1295 			mutex_exit(&conn->c_lock);
1296 			rib_rbuf_free(conn, RECV_BUFFER,
1297 			    (void *)(uintptr_t)rwid->addr);
1298 		}
1299 		rib_free_wid(rwid);
1300 	}
1301 }
1302 
1303 /* Server side */
1304 /* ARGSUSED */
1305 static void
1306 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1307 {
1308 	rdma_recv_data_t *rdp;
1309 	rib_qp_t	*qp;
1310 	ibt_status_t	ibt_status;
1311 	ibt_wc_t	wc;
1312 	struct svc_recv	*s_recvp;
1313 	CONN		*conn;
1314 	mblk_t		*mp;
1315 
1316 	/*
1317 	 * Re-enable cq notify here to avoid missing any
1318 	 * completion queue notification.
1319 	 */
1320 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1321 
1322 	ibt_status = IBT_SUCCESS;
1323 	while (ibt_status != IBT_CQ_EMPTY) {
1324 		bzero(&wc, sizeof (wc));
1325 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1326 		if (ibt_status != IBT_SUCCESS)
1327 			return;
1328 
1329 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1330 		qp = s_recvp->qp;
1331 		conn = qptoc(qp);
1332 		mutex_enter(&qp->posted_rbufs_lock);
1333 		qp->n_posted_rbufs--;
1334 		if (qp->n_posted_rbufs == 0)
1335 			cv_signal(&qp->posted_rbufs_cv);
1336 		mutex_exit(&qp->posted_rbufs_lock);
1337 
1338 		if (wc.wc_status == IBT_WC_SUCCESS) {
1339 			XDR	inxdrs, *xdrs;
1340 			uint_t	xid, vers, op;
1341 			uint32_t rdma_credit;
1342 
1343 			xdrs = &inxdrs;
1344 			/* s_recvp->vaddr stores data */
1345 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1346 			    wc.wc_bytes_xfer, XDR_DECODE);
1347 
1348 			/*
1349 			 * Treat xid as opaque (xid is the first entity
1350 			 * in the rpc rdma message).
1351 			 */
1352 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1353 			/* Skip xid and set the xdr position accordingly. */
1354 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1355 			if (!xdr_u_int(xdrs, &vers) ||
1356 			    !xdr_u_int(xdrs, &rdma_credit) ||
1357 			    !xdr_u_int(xdrs, &op)) {
1358 				rib_rbuf_free(conn, RECV_BUFFER,
1359 				    (void *)(uintptr_t)s_recvp->vaddr);
1360 				XDR_DESTROY(xdrs);
1361 				(void) rib_free_svc_recv(s_recvp);
1362 				continue;
1363 			}
1364 			XDR_DESTROY(xdrs);
1365 
1366 			if (vers != RPCRDMA_VERS) {
1367 				/*
1368 				 * Invalid RPC/RDMA version.
1369 				 * Drop rpc rdma message.
1370 				 */
1371 				rib_rbuf_free(conn, RECV_BUFFER,
1372 				    (void *)(uintptr_t)s_recvp->vaddr);
1373 				(void) rib_free_svc_recv(s_recvp);
1374 				continue;
1375 			}
1376 			/*
1377 			 * Is this for RDMA_DONE?
1378 			 */
1379 			if (op == RDMA_DONE) {
1380 				rib_rbuf_free(conn, RECV_BUFFER,
1381 				    (void *)(uintptr_t)s_recvp->vaddr);
1382 				/*
1383 				 * Wake up the thread waiting on
1384 				 * a RDMA_DONE for xid
1385 				 */
1386 				mutex_enter(&qp->rdlist_lock);
1387 				rdma_done_notify(qp, xid);
1388 				mutex_exit(&qp->rdlist_lock);
1389 				(void) rib_free_svc_recv(s_recvp);
1390 				continue;
1391 			}
1392 
1393 			mutex_enter(&plugin_state_lock);
1394 			if (plugin_state == ACCEPT) {
1395 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1396 				    == NULL)
1397 					(void) strwaitbuf(
1398 					    sizeof (*rdp), BPRI_LO);
1399 				/*
1400 				 * Plugin is in accept state, hence the master
1401 				 * transport queue for this is still accepting
1402 				 * requests. Hence we can call svc_queuereq to
1403 				 * queue this recieved msg.
1404 				 */
1405 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1406 				rdp->conn = conn;
1407 				rdp->rpcmsg.addr =
1408 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1409 				rdp->rpcmsg.type = RECV_BUFFER;
1410 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1411 				rdp->status = wc.wc_status;
1412 				mutex_enter(&conn->c_lock);
1413 				conn->c_ref++;
1414 				mutex_exit(&conn->c_lock);
1415 				mp->b_wptr += sizeof (*rdp);
1416 				svc_queuereq((queue_t *)rib_stat->q, mp);
1417 				mutex_exit(&plugin_state_lock);
1418 			} else {
1419 				/*
1420 				 * The master transport for this is going
1421 				 * away and the queue is not accepting anymore
1422 				 * requests for krpc, so don't do anything, just
1423 				 * free the msg.
1424 				 */
1425 				mutex_exit(&plugin_state_lock);
1426 				rib_rbuf_free(conn, RECV_BUFFER,
1427 				    (void *)(uintptr_t)s_recvp->vaddr);
1428 			}
1429 		} else {
1430 			rib_rbuf_free(conn, RECV_BUFFER,
1431 			    (void *)(uintptr_t)s_recvp->vaddr);
1432 		}
1433 		(void) rib_free_svc_recv(s_recvp);
1434 	}
1435 }
1436 
1437 static void
1438 rib_attach_hca()
1439 {
1440 	mutex_enter(&rib_stat->open_hca_lock);
1441 	(void) rpcib_open_hcas(rib_stat);
1442 	rib_listen(NULL);
1443 	mutex_exit(&rib_stat->open_hca_lock);
1444 }
1445 
1446 /*
1447  * Handles DR event of IBT_HCA_DETACH_EVENT.
1448  */
1449 /* ARGSUSED */
1450 static void
1451 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1452 	ibt_async_code_t code, ibt_async_event_t *event)
1453 {
1454 	switch (code) {
1455 	case IBT_HCA_ATTACH_EVENT:
1456 		rib_attach_hca();
1457 		break;
1458 	case IBT_HCA_DETACH_EVENT:
1459 	{
1460 		rib_hca_t *hca;
1461 
1462 		rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1463 		for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1464 			rw_enter(&hca->state_lock, RW_READER);
1465 			if ((hca->state != HCA_DETACHED) &&
1466 			    (hca->hca_hdl == hca_hdl)) {
1467 				rw_exit(&hca->state_lock);
1468 				break;
1469 			}
1470 			rw_exit(&hca->state_lock);
1471 		}
1472 		rw_exit(&rib_stat->hcas_list_lock);
1473 
1474 		if (hca == NULL)
1475 			return;
1476 		ASSERT(hca->hca_hdl == hca_hdl);
1477 		rib_detach_hca(hca);
1478 #ifdef DEBUG
1479 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1480 #endif
1481 		break;
1482 	}
1483 	case IBT_EVENT_PORT_UP:
1484 		/*
1485 		 * A port is up. We should call rib_listen() since there is
1486 		 * a chance that rib_listen() may have failed during
1487 		 * rib_attach_hca() because the port had not been up yet.
1488 		 */
1489 		rib_listen(NULL);
1490 #ifdef DEBUG
1491 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1492 #endif
1493 		break;
1494 #ifdef DEBUG
1495 	case IBT_EVENT_PATH_MIGRATED:
1496 		cmn_err(CE_NOTE, "rib_async_handler(): "
1497 		    "IBT_EVENT_PATH_MIGRATED\n");
1498 		break;
1499 	case IBT_EVENT_SQD:
1500 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1501 		break;
1502 	case IBT_EVENT_COM_EST:
1503 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1504 		break;
1505 	case IBT_ERROR_CATASTROPHIC_CHAN:
1506 		cmn_err(CE_NOTE, "rib_async_handler(): "
1507 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1508 		break;
1509 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1510 		cmn_err(CE_NOTE, "rib_async_handler(): "
1511 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1512 		break;
1513 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1514 		cmn_err(CE_NOTE, "rib_async_handler(): "
1515 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1516 		break;
1517 	case IBT_ERROR_PATH_MIGRATE_REQ:
1518 		cmn_err(CE_NOTE, "rib_async_handler(): "
1519 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1520 		break;
1521 	case IBT_ERROR_CQ:
1522 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1523 		break;
1524 	case IBT_ERROR_PORT_DOWN:
1525 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1526 		break;
1527 	case IBT_ASYNC_OPAQUE1:
1528 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1529 		break;
1530 	case IBT_ASYNC_OPAQUE2:
1531 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1532 		break;
1533 	case IBT_ASYNC_OPAQUE3:
1534 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1535 		break;
1536 	case IBT_ASYNC_OPAQUE4:
1537 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1538 		break;
1539 #endif
1540 	default:
1541 		break;
1542 	}
1543 }
1544 
1545 /*
1546  * Client's reachable function.
1547  */
1548 static rdma_stat
1549 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1550 {
1551 	rdma_stat	status;
1552 	rpcib_ping_t	rpt;
1553 	struct netbuf	saddr;
1554 	CONN		*conn;
1555 
1556 	bzero(&saddr, sizeof (struct netbuf));
1557 	status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1558 
1559 	if (status == RDMA_SUCCESS) {
1560 		*handle = (void *)rpt.hca;
1561 		/* release the reference */
1562 		(void) rib_conn_release(conn);
1563 		return (RDMA_SUCCESS);
1564 	} else {
1565 		*handle = NULL;
1566 		DTRACE_PROBE(rpcib__i__pingfailed);
1567 		return (RDMA_FAILED);
1568 	}
1569 }
1570 
1571 /* Client side qp creation */
1572 static rdma_stat
1573 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1574 {
1575 	rib_qp_t	*kqp = NULL;
1576 	CONN		*conn;
1577 	rdma_clnt_cred_ctrl_t *cc_info;
1578 
1579 	ASSERT(qp != NULL);
1580 	*qp = NULL;
1581 
1582 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1583 	conn = qptoc(kqp);
1584 	kqp->hca = hca;
1585 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1586 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1587 
1588 	kqp->mode = RIB_CLIENT;
1589 	kqp->chan_flags = IBT_BLOCKING;
1590 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1591 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1592 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1593 	/*
1594 	 * Initialize
1595 	 */
1596 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1597 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1598 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1599 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1600 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1601 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1602 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1603 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1604 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1605 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1606 	/*
1607 	 * Initialize the client credit control
1608 	 * portion of the rdmaconn struct.
1609 	 */
1610 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1611 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1612 	cc_info->clnt_cc_granted_ops = 0;
1613 	cc_info->clnt_cc_in_flight_ops = 0;
1614 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1615 
1616 	*qp = kqp;
1617 	return (RDMA_SUCCESS);
1618 }
1619 
1620 /* Server side qp creation */
1621 static rdma_stat
1622 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1623 {
1624 	rib_qp_t	*kqp = NULL;
1625 	ibt_chan_sizes_t	chan_sizes;
1626 	ibt_rc_chan_alloc_args_t	qp_attr;
1627 	ibt_status_t		ibt_status;
1628 	rdma_srv_cred_ctrl_t *cc_info;
1629 
1630 	*qp = NULL;
1631 
1632 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1633 	kqp->hca = hca;
1634 	kqp->port_num = port;
1635 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1636 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1637 
1638 	/*
1639 	 * Create the qp handle
1640 	 */
1641 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1642 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1643 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1644 	qp_attr.rc_pd = hca->pd_hdl;
1645 	qp_attr.rc_hca_port_num = port;
1646 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1647 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1648 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1649 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1650 	qp_attr.rc_clone_chan = NULL;
1651 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1652 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1653 
1654 	rw_enter(&hca->state_lock, RW_READER);
1655 	if (hca->state != HCA_DETACHED) {
1656 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1657 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1658 		    &chan_sizes);
1659 	} else {
1660 		rw_exit(&hca->state_lock);
1661 		goto fail;
1662 	}
1663 	rw_exit(&hca->state_lock);
1664 
1665 	if (ibt_status != IBT_SUCCESS) {
1666 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1667 		    int, ibt_status);
1668 		goto fail;
1669 	}
1670 
1671 	kqp->mode = RIB_SERVER;
1672 	kqp->chan_flags = IBT_BLOCKING;
1673 	kqp->q = q;	/* server ONLY */
1674 
1675 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1676 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1677 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1678 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1679 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1680 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1681 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1682 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1683 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1684 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1685 	/*
1686 	 * Set the private data area to qp to be used in callbacks
1687 	 */
1688 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1689 	kqp->rdmaconn.c_state = C_CONNECTED;
1690 
1691 	/*
1692 	 * Initialize the server credit control
1693 	 * portion of the rdmaconn struct.
1694 	 */
1695 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1696 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1697 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1698 	cc_info->srv_cc_cur_buffers_used = 0;
1699 	cc_info->srv_cc_posted = preposted_rbufs;
1700 
1701 	*qp = kqp;
1702 
1703 	return (RDMA_SUCCESS);
1704 fail:
1705 	if (kqp)
1706 		kmem_free(kqp, sizeof (rib_qp_t));
1707 
1708 	return (RDMA_FAILED);
1709 }
1710 
1711 /* ARGSUSED */
1712 ibt_cm_status_t
1713 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1714     ibt_cm_return_args_t *ret_args, void *priv_data,
1715     ibt_priv_data_len_t len)
1716 {
1717 	rib_hca_t	*hca;
1718 
1719 	hca = (rib_hca_t *)clnt_hdl;
1720 
1721 	switch (event->cm_type) {
1722 
1723 	/* got a connection close event */
1724 	case IBT_CM_EVENT_CONN_CLOSED:
1725 	{
1726 		CONN	*conn;
1727 		rib_qp_t *qp;
1728 
1729 		/* check reason why connection was closed */
1730 		switch (event->cm_event.closed) {
1731 		case IBT_CM_CLOSED_DREP_RCVD:
1732 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1733 		case IBT_CM_CLOSED_DUP:
1734 		case IBT_CM_CLOSED_ABORT:
1735 		case IBT_CM_CLOSED_ALREADY:
1736 			/*
1737 			 * These cases indicate the local end initiated
1738 			 * the closing of the channel. Nothing to do here.
1739 			 */
1740 			break;
1741 		default:
1742 			/*
1743 			 * Reason for CONN_CLOSED event must be one of
1744 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1745 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1746 			 * the remote end is closing the channel. In these
1747 			 * cases free the channel and transition to error
1748 			 * state
1749 			 */
1750 			qp = ibt_get_chan_private(event->cm_channel);
1751 			conn = qptoc(qp);
1752 			mutex_enter(&conn->c_lock);
1753 			if (conn->c_state == C_DISCONN_PEND) {
1754 				mutex_exit(&conn->c_lock);
1755 				break;
1756 			}
1757 
1758 			conn->c_state = C_ERROR_CONN;
1759 
1760 			/*
1761 			 * Free the conn if c_ref is down to 0 already
1762 			 */
1763 			if (conn->c_ref == 0) {
1764 				/*
1765 				 * Remove from list and free conn
1766 				 */
1767 				conn->c_state = C_DISCONN_PEND;
1768 				mutex_exit(&conn->c_lock);
1769 				rw_enter(&hca->state_lock, RW_READER);
1770 				if (hca->state != HCA_DETACHED)
1771 					(void) rib_disconnect_channel(conn,
1772 					    &hca->cl_conn_list);
1773 				rw_exit(&hca->state_lock);
1774 			} else {
1775 				/*
1776 				 * conn will be freed when c_ref goes to 0.
1777 				 * Indicate to cleaning thread not to close
1778 				 * the connection, but just free the channel.
1779 				 */
1780 				conn->c_flags |= C_CLOSE_NOTNEEDED;
1781 				mutex_exit(&conn->c_lock);
1782 			}
1783 #ifdef DEBUG
1784 			if (rib_debug)
1785 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1786 				    "(CONN_CLOSED) channel disconnected");
1787 #endif
1788 			break;
1789 		}
1790 		break;
1791 	}
1792 	default:
1793 		break;
1794 	}
1795 	return (IBT_CM_ACCEPT);
1796 }
1797 
1798 /*
1799  * Connect to the server.
1800  */
1801 rdma_stat
1802 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1803 {
1804 	ibt_chan_open_args_t	chan_args;	/* channel args */
1805 	ibt_chan_sizes_t	chan_sizes;
1806 	ibt_rc_chan_alloc_args_t	qp_attr;
1807 	ibt_status_t		ibt_status;
1808 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1809 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1810 	ibt_ip_cm_info_t	ipcm_info;
1811 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1812 
1813 
1814 	(void) bzero(&chan_args, sizeof (chan_args));
1815 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1816 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1817 
1818 	ipcm_info.src_addr.family = rptp->srcip.family;
1819 	switch (ipcm_info.src_addr.family) {
1820 	case AF_INET:
1821 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1822 		break;
1823 	case AF_INET6:
1824 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1825 		break;
1826 	}
1827 
1828 	ipcm_info.dst_addr.family = rptp->srcip.family;
1829 	switch (ipcm_info.dst_addr.family) {
1830 	case AF_INET:
1831 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1832 		break;
1833 	case AF_INET6:
1834 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1835 		break;
1836 	}
1837 
1838 	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1839 
1840 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1841 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1842 
1843 	if (ibt_status != IBT_SUCCESS) {
1844 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1845 		return (-1);
1846 	}
1847 
1848 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1849 	/* Alloc a RC channel */
1850 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1851 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1852 	qp_attr.rc_pd = hca->pd_hdl;
1853 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1854 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1855 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1856 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1857 	qp_attr.rc_clone_chan = NULL;
1858 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1859 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1860 
1861 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1862 	chan_args.oc_path = &rptp->path;
1863 
1864 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1865 	chan_args.oc_cm_clnt_private = (void *)hca;
1866 	chan_args.oc_rdma_ra_out = 4;
1867 	chan_args.oc_rdma_ra_in = 4;
1868 	chan_args.oc_path_retry_cnt = 2;
1869 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1870 	chan_args.oc_priv_data = cmp_ip_pvt;
1871 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1872 
1873 refresh:
1874 	rw_enter(&hca->state_lock, RW_READER);
1875 	if (hca->state != HCA_DETACHED) {
1876 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1877 		    IBT_ACHAN_NO_FLAGS,
1878 		    &qp_attr, &qp->qp_hdl,
1879 		    &chan_sizes);
1880 	} else {
1881 		rw_exit(&hca->state_lock);
1882 		return (RDMA_FAILED);
1883 	}
1884 	rw_exit(&hca->state_lock);
1885 
1886 	if (ibt_status != IBT_SUCCESS) {
1887 		DTRACE_PROBE1(rpcib__i_conntosrv,
1888 		    int, ibt_status);
1889 		return (RDMA_FAILED);
1890 	}
1891 
1892 	/* Connect to the Server */
1893 	(void) bzero(&ret_args, sizeof (ret_args));
1894 	mutex_enter(&qp->cb_lock);
1895 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1896 	    IBT_BLOCKING, &chan_args, &ret_args);
1897 	if (ibt_status != IBT_SUCCESS) {
1898 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1899 		    int, ibt_status, int, ret_args.rc_status);
1900 
1901 		(void) ibt_free_channel(qp->qp_hdl);
1902 		qp->qp_hdl = NULL;
1903 		mutex_exit(&qp->cb_lock);
1904 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1905 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1906 			/*
1907 			 * Got IBT_CM_CONN_STALE probably because of stale
1908 			 * data on the passive end of a channel that existed
1909 			 * prior to reboot. Retry establishing a channel
1910 			 * REFRESH_ATTEMPTS times, during which time the
1911 			 * stale conditions on the server might clear up.
1912 			 */
1913 			goto refresh;
1914 		}
1915 		return (RDMA_FAILED);
1916 	}
1917 	mutex_exit(&qp->cb_lock);
1918 	/*
1919 	 * Set the private data area to qp to be used in callbacks
1920 	 */
1921 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1922 	return (RDMA_SUCCESS);
1923 }
1924 
1925 rdma_stat
1926 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1927 {
1928 	uint_t			i, addr_count;
1929 	ibt_status_t		ibt_status;
1930 	uint8_t			num_paths_p;
1931 	ibt_ip_path_attr_t	ipattr;
1932 	ibt_path_ip_src_t	srcip;
1933 	rpcib_ipaddrs_t		addrs4;
1934 	rpcib_ipaddrs_t		addrs6;
1935 	struct sockaddr_in	*sinp;
1936 	struct sockaddr_in6	*sin6p;
1937 	rdma_stat		retval = RDMA_FAILED;
1938 	rib_hca_t *hca;
1939 
1940 	if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1941 		return (RDMA_INVAL);
1942 	ASSERT(raddr->buf != NULL);
1943 
1944 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1945 
1946 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1947 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1948 		retval = RDMA_FAILED;
1949 		goto done2;
1950 	}
1951 
1952 	if (addr_type == AF_INET) {
1953 		addr_count = addrs4.ri_count;
1954 		sinp = (struct sockaddr_in *)raddr->buf;
1955 		rptp->dstip.family = AF_INET;
1956 		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1957 		sinp = addrs4.ri_list;
1958 	} else {
1959 		addr_count = addrs6.ri_count;
1960 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1961 		rptp->dstip.family = AF_INET6;
1962 		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1963 		sin6p = addrs6.ri_list;
1964 	}
1965 
1966 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1967 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1968 		rw_enter(&hca->state_lock, RW_READER);
1969 		if (hca->state == HCA_DETACHED) {
1970 			rw_exit(&hca->state_lock);
1971 			continue;
1972 		}
1973 
1974 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1975 		ipattr.ipa_hca_guid	= hca->hca_guid;
1976 		ipattr.ipa_ndst		= 1;
1977 		ipattr.ipa_max_paths	= 1;
1978 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1979 		for (i = 0; i < addr_count; i++) {
1980 			num_paths_p = 0;
1981 			if (addr_type == AF_INET) {
1982 				ipattr.ipa_src_ip.un.ip4addr =
1983 				    sinp[i].sin_addr.s_addr;
1984 			} else {
1985 				ipattr.ipa_src_ip.un.ip6addr =
1986 				    sin6p[i].sin6_addr;
1987 			}
1988 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1989 
1990 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1991 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1992 			    &num_paths_p, &srcip);
1993 			if (ibt_status == IBT_SUCCESS &&
1994 			    num_paths_p != 0 &&
1995 			    rptp->path.pi_hca_guid == hca->hca_guid) {
1996 				rptp->hca = hca;
1997 				rw_exit(&hca->state_lock);
1998 				if (addr_type == AF_INET) {
1999 					rptp->srcip.family = AF_INET;
2000 					rptp->srcip.un.ip4addr =
2001 					    srcip.ip_primary.un.ip4addr;
2002 				} else {
2003 					rptp->srcip.family = AF_INET6;
2004 					rptp->srcip.un.ip6addr =
2005 					    srcip.ip_primary.un.ip6addr;
2006 
2007 				}
2008 				retval = RDMA_SUCCESS;
2009 				goto done1;
2010 			}
2011 		}
2012 		rw_exit(&hca->state_lock);
2013 	}
2014 done1:
2015 	rw_exit(&rib_stat->hcas_list_lock);
2016 done2:
2017 	if (addrs4.ri_size > 0)
2018 		kmem_free(addrs4.ri_list, addrs4.ri_size);
2019 	if (addrs6.ri_size > 0)
2020 		kmem_free(addrs6.ri_list, addrs6.ri_size);
2021 	return (retval);
2022 }
2023 
2024 /*
2025  * Close channel, remove from connection list and
2026  * free up resources allocated for that channel.
2027  */
2028 rdma_stat
2029 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2030 {
2031 	rib_qp_t	*qp = ctoqp(conn);
2032 	rib_hca_t	*hca;
2033 
2034 	mutex_enter(&conn->c_lock);
2035 	if (conn->c_timeout != NULL) {
2036 		mutex_exit(&conn->c_lock);
2037 		(void) untimeout(conn->c_timeout);
2038 		mutex_enter(&conn->c_lock);
2039 	}
2040 
2041 	while (conn->c_flags & C_CLOSE_PENDING) {
2042 		cv_wait(&conn->c_cv, &conn->c_lock);
2043 	}
2044 	mutex_exit(&conn->c_lock);
2045 
2046 	/*
2047 	 * c_ref == 0 and connection is in C_DISCONN_PEND
2048 	 */
2049 	hca = qp->hca;
2050 	if (conn_list != NULL)
2051 		(void) rib_rm_conn(conn, conn_list);
2052 
2053 	/*
2054 	 * There is only one case where we get here with
2055 	 * qp_hdl = NULL, which is during connection setup on
2056 	 * the client. In such a case there are no posted
2057 	 * send/recv buffers.
2058 	 */
2059 	if (qp->qp_hdl != NULL) {
2060 		mutex_enter(&qp->posted_rbufs_lock);
2061 		while (qp->n_posted_rbufs)
2062 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2063 		mutex_exit(&qp->posted_rbufs_lock);
2064 
2065 		mutex_enter(&qp->send_rbufs_lock);
2066 		while (qp->n_send_rbufs)
2067 			cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2068 		mutex_exit(&qp->send_rbufs_lock);
2069 
2070 		(void) ibt_free_channel(qp->qp_hdl);
2071 		qp->qp_hdl = NULL;
2072 	}
2073 
2074 	ASSERT(qp->rdlist == NULL);
2075 
2076 	if (qp->replylist != NULL) {
2077 		(void) rib_rem_replylist(qp);
2078 	}
2079 
2080 	cv_destroy(&qp->cb_conn_cv);
2081 	cv_destroy(&qp->posted_rbufs_cv);
2082 	cv_destroy(&qp->send_rbufs_cv);
2083 	mutex_destroy(&qp->cb_lock);
2084 	mutex_destroy(&qp->replylist_lock);
2085 	mutex_destroy(&qp->posted_rbufs_lock);
2086 	mutex_destroy(&qp->send_rbufs_lock);
2087 	mutex_destroy(&qp->rdlist_lock);
2088 
2089 	cv_destroy(&conn->c_cv);
2090 	mutex_destroy(&conn->c_lock);
2091 
2092 	if (conn->c_raddr.buf != NULL) {
2093 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2094 	}
2095 	if (conn->c_laddr.buf != NULL) {
2096 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2097 	}
2098 
2099 	/*
2100 	 * Credit control cleanup.
2101 	 */
2102 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2103 		rdma_clnt_cred_ctrl_t *cc_info;
2104 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2105 		cv_destroy(&cc_info->clnt_cc_cv);
2106 	}
2107 
2108 	kmem_free(qp, sizeof (rib_qp_t));
2109 
2110 	/*
2111 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2112 	 * then the hca is no longer being used.
2113 	 */
2114 	if (conn_list != NULL) {
2115 		rw_enter(&hca->state_lock, RW_READER);
2116 		if (hca->state == HCA_DETACHED) {
2117 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2118 			if (hca->srv_conn_list.conn_hd == NULL) {
2119 				rw_enter(&hca->cl_conn_list.conn_lock,
2120 				    RW_READER);
2121 
2122 				if (hca->cl_conn_list.conn_hd == NULL) {
2123 					mutex_enter(&hca->inuse_lock);
2124 					hca->inuse = FALSE;
2125 					cv_signal(&hca->cb_cv);
2126 					mutex_exit(&hca->inuse_lock);
2127 				}
2128 				rw_exit(&hca->cl_conn_list.conn_lock);
2129 			}
2130 			rw_exit(&hca->srv_conn_list.conn_lock);
2131 		}
2132 		rw_exit(&hca->state_lock);
2133 	}
2134 
2135 	return (RDMA_SUCCESS);
2136 }
2137 
2138 /*
2139  * All sends are done under the protection of
2140  * the wdesc->sendwait_lock. n_send_rbufs count
2141  * is protected using the send_rbufs_lock.
2142  * lock ordering is:
2143  * sendwait_lock -> send_rbufs_lock
2144  */
2145 
2146 void
2147 rib_send_hold(rib_qp_t *qp)
2148 {
2149 	mutex_enter(&qp->send_rbufs_lock);
2150 	qp->n_send_rbufs++;
2151 	mutex_exit(&qp->send_rbufs_lock);
2152 }
2153 
2154 void
2155 rib_send_rele(rib_qp_t *qp)
2156 {
2157 	mutex_enter(&qp->send_rbufs_lock);
2158 	qp->n_send_rbufs--;
2159 	if (qp->n_send_rbufs == 0)
2160 		cv_signal(&qp->send_rbufs_cv);
2161 	mutex_exit(&qp->send_rbufs_lock);
2162 }
2163 
2164 /*
2165  * Wait for send completion notification. Only on receiving a
2166  * notification be it a successful or error completion, free the
2167  * send_wid.
2168  */
2169 static rdma_stat
2170 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2171 {
2172 	clock_t timout, cv_wait_ret;
2173 	rdma_stat error = RDMA_SUCCESS;
2174 	int	i;
2175 
2176 	/*
2177 	 * Wait for send to complete
2178 	 */
2179 	ASSERT(wd != NULL);
2180 	mutex_enter(&wd->sendwait_lock);
2181 	if (wd->status == (uint_t)SEND_WAIT) {
2182 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2183 		    ddi_get_lbolt();
2184 
2185 		if (qp->mode == RIB_SERVER) {
2186 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2187 			    &wd->sendwait_lock, timout)) > 0 &&
2188 			    wd->status == (uint_t)SEND_WAIT)
2189 				;
2190 			switch (cv_wait_ret) {
2191 			case -1:	/* timeout */
2192 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2193 
2194 				wd->cv_sig = 0;		/* no signal needed */
2195 				error = RDMA_TIMEDOUT;
2196 				break;
2197 			default:	/* got send completion */
2198 				break;
2199 			}
2200 		} else {
2201 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2202 			    &wd->sendwait_lock, timout)) > 0 &&
2203 			    wd->status == (uint_t)SEND_WAIT)
2204 				;
2205 			switch (cv_wait_ret) {
2206 			case -1:	/* timeout */
2207 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2208 
2209 				wd->cv_sig = 0;		/* no signal needed */
2210 				error = RDMA_TIMEDOUT;
2211 				break;
2212 			case 0:		/* interrupted */
2213 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2214 
2215 				wd->cv_sig = 0;		/* no signal needed */
2216 				error = RDMA_INTR;
2217 				break;
2218 			default:	/* got send completion */
2219 				break;
2220 			}
2221 		}
2222 	}
2223 
2224 	if (wd->status != (uint_t)SEND_WAIT) {
2225 		/* got send completion */
2226 		if (wd->status != RDMA_SUCCESS) {
2227 			switch (wd->status) {
2228 			case RDMA_CONNLOST:
2229 				error = RDMA_CONNLOST;
2230 				break;
2231 			default:
2232 				error = RDMA_FAILED;
2233 				break;
2234 			}
2235 		}
2236 		for (i = 0; i < wd->nsbufs; i++) {
2237 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2238 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2239 		}
2240 
2241 		rib_send_rele(qp);
2242 
2243 		mutex_exit(&wd->sendwait_lock);
2244 		(void) rib_free_sendwait(wd);
2245 
2246 	} else {
2247 		mutex_exit(&wd->sendwait_lock);
2248 	}
2249 	return (error);
2250 }
2251 
2252 static struct send_wid *
2253 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2254 {
2255 	struct send_wid	*wd;
2256 
2257 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2258 	wd->xid = xid;
2259 	wd->cv_sig = cv_sig;
2260 	wd->qp = qp;
2261 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2262 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2263 	wd->status = (uint_t)SEND_WAIT;
2264 
2265 	return (wd);
2266 }
2267 
2268 static int
2269 rib_free_sendwait(struct send_wid *wdesc)
2270 {
2271 	cv_destroy(&wdesc->wait_cv);
2272 	mutex_destroy(&wdesc->sendwait_lock);
2273 	kmem_free(wdesc, sizeof (*wdesc));
2274 
2275 	return (0);
2276 }
2277 
2278 static rdma_stat
2279 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2280 {
2281 	mutex_enter(&qp->replylist_lock);
2282 	if (rep != NULL) {
2283 		(void) rib_remreply(qp, rep);
2284 		mutex_exit(&qp->replylist_lock);
2285 		return (RDMA_SUCCESS);
2286 	}
2287 	mutex_exit(&qp->replylist_lock);
2288 	return (RDMA_FAILED);
2289 }
2290 
2291 /*
2292  * Send buffers are freed here only in case of error in posting
2293  * on QP. If the post succeeded, the send buffers are freed upon
2294  * send completion in rib_sendwait() or in the scq_handler.
2295  */
2296 rdma_stat
2297 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2298 	int send_sig, int cv_sig, caddr_t *swid)
2299 {
2300 	struct send_wid	*wdesc;
2301 	struct clist	*clp;
2302 	ibt_status_t	ibt_status = IBT_SUCCESS;
2303 	rdma_stat	ret = RDMA_SUCCESS;
2304 	ibt_send_wr_t	tx_wr;
2305 	int		i, nds;
2306 	ibt_wr_ds_t	sgl[DSEG_MAX];
2307 	uint_t		total_msg_size;
2308 	rib_qp_t	*qp;
2309 
2310 	qp = ctoqp(conn);
2311 
2312 	ASSERT(cl != NULL);
2313 
2314 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2315 
2316 	nds = 0;
2317 	total_msg_size = 0;
2318 	clp = cl;
2319 	while (clp != NULL) {
2320 		if (nds >= DSEG_MAX) {
2321 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2322 			return (RDMA_FAILED);
2323 		}
2324 		sgl[nds].ds_va = clp->w.c_saddr;
2325 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2326 		sgl[nds].ds_len = clp->c_len;
2327 		total_msg_size += clp->c_len;
2328 		clp = clp->c_next;
2329 		nds++;
2330 	}
2331 
2332 	if (send_sig) {
2333 		/* Set SEND_SIGNAL flag. */
2334 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2335 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2336 		*swid = (caddr_t)wdesc;
2337 		tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2338 		mutex_enter(&wdesc->sendwait_lock);
2339 		wdesc->nsbufs = nds;
2340 		for (i = 0; i < nds; i++) {
2341 			wdesc->sbufaddr[i] = sgl[i].ds_va;
2342 		}
2343 	} else {
2344 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2345 		*swid = NULL;
2346 		tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2347 	}
2348 
2349 	tx_wr.wr_opcode = IBT_WRC_SEND;
2350 	tx_wr.wr_trans = IBT_RC_SRV;
2351 	tx_wr.wr_nds = nds;
2352 	tx_wr.wr_sgl = sgl;
2353 
2354 	mutex_enter(&conn->c_lock);
2355 	if (conn->c_state == C_CONNECTED) {
2356 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2357 	}
2358 	if (conn->c_state != C_CONNECTED ||
2359 	    ibt_status != IBT_SUCCESS) {
2360 		if (conn->c_state != C_DISCONN_PEND)
2361 			conn->c_state = C_ERROR_CONN;
2362 		mutex_exit(&conn->c_lock);
2363 		if (send_sig) {
2364 			for (i = 0; i < nds; i++) {
2365 				rib_rbuf_free(conn, SEND_BUFFER,
2366 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2367 			}
2368 			mutex_exit(&wdesc->sendwait_lock);
2369 			(void) rib_free_sendwait(wdesc);
2370 		}
2371 		return (RDMA_CONNLOST);
2372 	}
2373 
2374 	mutex_exit(&conn->c_lock);
2375 
2376 	if (send_sig) {
2377 		rib_send_hold(qp);
2378 		mutex_exit(&wdesc->sendwait_lock);
2379 		if (cv_sig) {
2380 			/*
2381 			 * cv_wait for send to complete.
2382 			 * We can fail due to a timeout or signal or
2383 			 * unsuccessful send.
2384 			 */
2385 			ret = rib_sendwait(qp, wdesc);
2386 
2387 			return (ret);
2388 		}
2389 	}
2390 
2391 	return (RDMA_SUCCESS);
2392 }
2393 
2394 
2395 rdma_stat
2396 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2397 {
2398 	rdma_stat	ret;
2399 	caddr_t		wd;
2400 
2401 	/* send-wait & cv_signal */
2402 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2403 	return (ret);
2404 }
2405 
2406 /*
2407  * Deprecated/obsolete interface not used currently
2408  * but earlier used for READ-READ protocol.
2409  * Send RPC reply and wait for RDMA_DONE.
2410  */
2411 rdma_stat
2412 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2413 {
2414 	rdma_stat ret = RDMA_SUCCESS;
2415 	struct rdma_done_list *rd;
2416 	clock_t timout, cv_wait_ret;
2417 	caddr_t *wid = NULL;
2418 	rib_qp_t *qp = ctoqp(conn);
2419 
2420 	mutex_enter(&qp->rdlist_lock);
2421 	rd = rdma_done_add(qp, msgid);
2422 
2423 	/* No cv_signal (whether send-wait or no-send-wait) */
2424 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2425 
2426 	if (ret != RDMA_SUCCESS) {
2427 		rdma_done_rm(qp, rd);
2428 	} else {
2429 		/*
2430 		 * Wait for RDMA_DONE from remote end
2431 		 */
2432 		timout =
2433 		    drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2434 		cv_wait_ret = cv_timedwait(&rd->rdma_done_cv,
2435 		    &qp->rdlist_lock,
2436 		    timout);
2437 
2438 		rdma_done_rm(qp, rd);
2439 
2440 		if (cv_wait_ret < 0) {
2441 			ret = RDMA_TIMEDOUT;
2442 		}
2443 	}
2444 
2445 	mutex_exit(&qp->rdlist_lock);
2446 	return (ret);
2447 }
2448 
2449 static struct recv_wid *
2450 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2451 {
2452 	struct recv_wid	*rwid;
2453 
2454 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2455 	rwid->xid = msgid;
2456 	rwid->addr = sgl->ds_va;
2457 	rwid->qp = qp;
2458 
2459 	return (rwid);
2460 }
2461 
2462 static void
2463 rib_free_wid(struct recv_wid *rwid)
2464 {
2465 	kmem_free(rwid, sizeof (struct recv_wid));
2466 }
2467 
2468 rdma_stat
2469 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2470 {
2471 	rib_qp_t	*qp = ctoqp(conn);
2472 	struct clist	*clp = cl;
2473 	struct reply	*rep;
2474 	struct recv_wid	*rwid;
2475 	int		nds;
2476 	ibt_wr_ds_t	sgl[DSEG_MAX];
2477 	ibt_recv_wr_t	recv_wr;
2478 	rdma_stat	ret;
2479 	ibt_status_t	ibt_status;
2480 
2481 	/*
2482 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2483 	 */
2484 
2485 	nds = 0;
2486 	while (cl != NULL) {
2487 		if (nds >= DSEG_MAX) {
2488 			ret = RDMA_FAILED;
2489 			goto done;
2490 		}
2491 		sgl[nds].ds_va = cl->w.c_saddr;
2492 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2493 		sgl[nds].ds_len = cl->c_len;
2494 		cl = cl->c_next;
2495 		nds++;
2496 	}
2497 
2498 	if (nds != 1) {
2499 		ret = RDMA_FAILED;
2500 		goto done;
2501 	}
2502 
2503 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2504 	recv_wr.wr_nds = nds;
2505 	recv_wr.wr_sgl = sgl;
2506 
2507 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2508 	if (rwid) {
2509 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2510 	} else {
2511 		ret = RDMA_NORESOURCE;
2512 		goto done;
2513 	}
2514 	rep = rib_addreplylist(qp, msgid);
2515 	if (!rep) {
2516 		rib_free_wid(rwid);
2517 		ret = RDMA_NORESOURCE;
2518 		goto done;
2519 	}
2520 
2521 	mutex_enter(&conn->c_lock);
2522 
2523 	if (conn->c_state == C_CONNECTED) {
2524 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2525 	}
2526 
2527 	if (conn->c_state != C_CONNECTED ||
2528 	    ibt_status != IBT_SUCCESS) {
2529 		if (conn->c_state != C_DISCONN_PEND)
2530 			conn->c_state = C_ERROR_CONN;
2531 		mutex_exit(&conn->c_lock);
2532 		rib_free_wid(rwid);
2533 		(void) rib_rem_rep(qp, rep);
2534 		ret = RDMA_CONNLOST;
2535 		goto done;
2536 	}
2537 	mutex_exit(&conn->c_lock);
2538 	return (RDMA_SUCCESS);
2539 
2540 done:
2541 	while (clp != NULL) {
2542 		rib_rbuf_free(conn, RECV_BUFFER,
2543 		    (void *)(uintptr_t)clp->w.c_saddr3);
2544 		clp = clp->c_next;
2545 	}
2546 	return (ret);
2547 }
2548 
2549 rdma_stat
2550 rib_svc_post(CONN* conn, struct clist *cl)
2551 {
2552 	rib_qp_t	*qp = ctoqp(conn);
2553 	struct svc_recv	*s_recvp;
2554 	int		nds;
2555 	ibt_wr_ds_t	sgl[DSEG_MAX];
2556 	ibt_recv_wr_t	recv_wr;
2557 	ibt_status_t	ibt_status;
2558 
2559 	nds = 0;
2560 	while (cl != NULL) {
2561 		if (nds >= DSEG_MAX) {
2562 			return (RDMA_FAILED);
2563 		}
2564 		sgl[nds].ds_va = cl->w.c_saddr;
2565 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2566 		sgl[nds].ds_len = cl->c_len;
2567 		cl = cl->c_next;
2568 		nds++;
2569 	}
2570 
2571 	if (nds != 1) {
2572 		rib_rbuf_free(conn, RECV_BUFFER,
2573 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2574 
2575 		return (RDMA_FAILED);
2576 	}
2577 
2578 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2579 	recv_wr.wr_nds = nds;
2580 	recv_wr.wr_sgl = sgl;
2581 
2582 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2583 	/* Use s_recvp's addr as wr id */
2584 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2585 	mutex_enter(&conn->c_lock);
2586 	if (conn->c_state == C_CONNECTED) {
2587 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2588 	}
2589 	if (conn->c_state != C_CONNECTED ||
2590 	    ibt_status != IBT_SUCCESS) {
2591 		if (conn->c_state != C_DISCONN_PEND)
2592 			conn->c_state = C_ERROR_CONN;
2593 		mutex_exit(&conn->c_lock);
2594 		rib_rbuf_free(conn, RECV_BUFFER,
2595 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2596 		(void) rib_free_svc_recv(s_recvp);
2597 
2598 		return (RDMA_CONNLOST);
2599 	}
2600 	mutex_exit(&conn->c_lock);
2601 
2602 	return (RDMA_SUCCESS);
2603 }
2604 
2605 /* Client */
2606 rdma_stat
2607 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2608 {
2609 
2610 	return (rib_clnt_post(conn, cl, msgid));
2611 }
2612 
2613 /* Client */
2614 rdma_stat
2615 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2616 {
2617 	rib_qp_t	*qp = ctoqp(conn);
2618 	struct reply	*rep;
2619 
2620 	mutex_enter(&qp->replylist_lock);
2621 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2622 		if (rep->xid == msgid) {
2623 			if (rep->vaddr_cq) {
2624 				rib_rbuf_free(conn, RECV_BUFFER,
2625 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2626 			}
2627 			(void) rib_remreply(qp, rep);
2628 			break;
2629 		}
2630 	}
2631 	mutex_exit(&qp->replylist_lock);
2632 
2633 	return (RDMA_SUCCESS);
2634 }
2635 
2636 /* Server */
2637 rdma_stat
2638 rib_post_recv(CONN *conn, struct clist *cl)
2639 {
2640 	rib_qp_t	*qp = ctoqp(conn);
2641 
2642 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2643 		mutex_enter(&qp->posted_rbufs_lock);
2644 		qp->n_posted_rbufs++;
2645 		mutex_exit(&qp->posted_rbufs_lock);
2646 		return (RDMA_SUCCESS);
2647 	}
2648 	return (RDMA_FAILED);
2649 }
2650 
2651 /*
2652  * Client side only interface to "recv" the rpc reply buf
2653  * posted earlier by rib_post_resp(conn, cl, msgid).
2654  */
2655 rdma_stat
2656 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2657 {
2658 	struct reply *rep = NULL;
2659 	clock_t timout, cv_wait_ret;
2660 	rdma_stat ret = RDMA_SUCCESS;
2661 	rib_qp_t *qp = ctoqp(conn);
2662 
2663 	/*
2664 	 * Find the reply structure for this msgid
2665 	 */
2666 	mutex_enter(&qp->replylist_lock);
2667 
2668 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2669 		if (rep->xid == msgid)
2670 			break;
2671 	}
2672 
2673 	if (rep != NULL) {
2674 		/*
2675 		 * If message not yet received, wait.
2676 		 */
2677 		if (rep->status == (uint_t)REPLY_WAIT) {
2678 			timout = ddi_get_lbolt() +
2679 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2680 
2681 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2682 			    &qp->replylist_lock, timout)) > 0 &&
2683 			    rep->status == (uint_t)REPLY_WAIT)
2684 				;
2685 
2686 			switch (cv_wait_ret) {
2687 			case -1:	/* timeout */
2688 				ret = RDMA_TIMEDOUT;
2689 				break;
2690 			case 0:
2691 				ret = RDMA_INTR;
2692 				break;
2693 			default:
2694 				break;
2695 			}
2696 		}
2697 
2698 		if (rep->status == RDMA_SUCCESS) {
2699 			struct clist *cl = NULL;
2700 
2701 			/*
2702 			 * Got message successfully
2703 			 */
2704 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2705 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2706 			*clp = cl;
2707 		} else {
2708 			if (rep->status != (uint_t)REPLY_WAIT) {
2709 				/*
2710 				 * Got error in reply message. Free
2711 				 * recv buffer here.
2712 				 */
2713 				ret = rep->status;
2714 				rib_rbuf_free(conn, RECV_BUFFER,
2715 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2716 			}
2717 		}
2718 		(void) rib_remreply(qp, rep);
2719 	} else {
2720 		/*
2721 		 * No matching reply structure found for given msgid on the
2722 		 * reply wait list.
2723 		 */
2724 		ret = RDMA_INVAL;
2725 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2726 	}
2727 
2728 	/*
2729 	 * Done.
2730 	 */
2731 	mutex_exit(&qp->replylist_lock);
2732 	return (ret);
2733 }
2734 
2735 /*
2736  * RDMA write a buffer to the remote address.
2737  */
2738 rdma_stat
2739 rib_write(CONN *conn, struct clist *cl, int wait)
2740 {
2741 	ibt_send_wr_t	tx_wr;
2742 	int		cv_sig;
2743 	ibt_wr_ds_t	sgl[DSEG_MAX];
2744 	struct send_wid	*wdesc;
2745 	ibt_status_t	ibt_status;
2746 	rdma_stat	ret = RDMA_SUCCESS;
2747 	rib_qp_t	*qp = ctoqp(conn);
2748 	uint64_t	n_writes = 0;
2749 
2750 	if (cl == NULL) {
2751 		return (RDMA_FAILED);
2752 	}
2753 
2754 	while ((cl != NULL)) {
2755 		if (cl->c_len > 0) {
2756 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2757 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2758 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2759 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2760 			sgl[0].ds_va = cl->w.c_saddr;
2761 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2762 			sgl[0].ds_len = cl->c_len;
2763 
2764 			if (wait) {
2765 				cv_sig = 1;
2766 			} else {
2767 				if (n_writes > max_unsignaled_rws) {
2768 					n_writes = 0;
2769 					cv_sig = 1;
2770 				} else {
2771 					cv_sig = 0;
2772 				}
2773 			}
2774 
2775 			if (cv_sig) {
2776 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2777 				wdesc = rib_init_sendwait(0, cv_sig, qp);
2778 				tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2779 				mutex_enter(&wdesc->sendwait_lock);
2780 			} else {
2781 				tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2782 				tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2783 			}
2784 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2785 			tx_wr.wr_trans = IBT_RC_SRV;
2786 			tx_wr.wr_nds = 1;
2787 			tx_wr.wr_sgl = sgl;
2788 
2789 			mutex_enter(&conn->c_lock);
2790 			if (conn->c_state == C_CONNECTED) {
2791 				ibt_status =
2792 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2793 			}
2794 			if (conn->c_state != C_CONNECTED ||
2795 			    ibt_status != IBT_SUCCESS) {
2796 				if (conn->c_state != C_DISCONN_PEND)
2797 					conn->c_state = C_ERROR_CONN;
2798 				mutex_exit(&conn->c_lock);
2799 				if (cv_sig) {
2800 					mutex_exit(&wdesc->sendwait_lock);
2801 					(void) rib_free_sendwait(wdesc);
2802 				}
2803 				return (RDMA_CONNLOST);
2804 			}
2805 
2806 			mutex_exit(&conn->c_lock);
2807 
2808 			/*
2809 			 * Wait for send to complete
2810 			 */
2811 			if (cv_sig) {
2812 
2813 				rib_send_hold(qp);
2814 				mutex_exit(&wdesc->sendwait_lock);
2815 
2816 				ret = rib_sendwait(qp, wdesc);
2817 				if (ret != 0)
2818 					return (ret);
2819 			}
2820 			n_writes ++;
2821 		}
2822 		cl = cl->c_next;
2823 	}
2824 	return (RDMA_SUCCESS);
2825 }
2826 
2827 /*
2828  * RDMA Read a buffer from the remote address.
2829  */
2830 rdma_stat
2831 rib_read(CONN *conn, struct clist *cl, int wait)
2832 {
2833 	ibt_send_wr_t	rx_wr;
2834 	int		cv_sig = 0;
2835 	ibt_wr_ds_t	sgl;
2836 	struct send_wid	*wdesc;
2837 	ibt_status_t	ibt_status = IBT_SUCCESS;
2838 	rdma_stat	ret = RDMA_SUCCESS;
2839 	rib_qp_t	*qp = ctoqp(conn);
2840 
2841 	if (cl == NULL) {
2842 		return (RDMA_FAILED);
2843 	}
2844 
2845 	while (cl != NULL) {
2846 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2847 		/*
2848 		 * Remote address is at the head chunk item in list.
2849 		 */
2850 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2851 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2852 
2853 		sgl.ds_va = cl->u.c_daddr;
2854 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2855 		sgl.ds_len = cl->c_len;
2856 
2857 		/*
2858 		 * If there are multiple chunks to be read, and
2859 		 * wait is set, ask for signal only for the last chunk
2860 		 * and wait only on the last chunk. The completion of
2861 		 * RDMA_READ on last chunk ensures that reads on all
2862 		 * previous chunks are also completed.
2863 		 */
2864 		if (wait && (cl->c_next == NULL)) {
2865 			cv_sig = 1;
2866 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2867 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2868 			rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2869 			mutex_enter(&wdesc->sendwait_lock);
2870 		} else {
2871 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2872 			rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2873 		}
2874 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2875 		rx_wr.wr_trans = IBT_RC_SRV;
2876 		rx_wr.wr_nds = 1;
2877 		rx_wr.wr_sgl = &sgl;
2878 
2879 		mutex_enter(&conn->c_lock);
2880 		if (conn->c_state == C_CONNECTED) {
2881 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2882 		}
2883 		if (conn->c_state != C_CONNECTED ||
2884 		    ibt_status != IBT_SUCCESS) {
2885 			if (conn->c_state != C_DISCONN_PEND)
2886 				conn->c_state = C_ERROR_CONN;
2887 			mutex_exit(&conn->c_lock);
2888 			if (wait && (cl->c_next == NULL)) {
2889 				mutex_exit(&wdesc->sendwait_lock);
2890 				(void) rib_free_sendwait(wdesc);
2891 			}
2892 			return (RDMA_CONNLOST);
2893 		}
2894 
2895 		mutex_exit(&conn->c_lock);
2896 
2897 		/*
2898 		 * Wait for send to complete if this is the
2899 		 * last item in the list.
2900 		 */
2901 		if (wait && cl->c_next == NULL) {
2902 			rib_send_hold(qp);
2903 			mutex_exit(&wdesc->sendwait_lock);
2904 
2905 			ret = rib_sendwait(qp, wdesc);
2906 
2907 			if (ret != 0)
2908 				return (ret);
2909 		}
2910 		cl = cl->c_next;
2911 	}
2912 	return (RDMA_SUCCESS);
2913 }
2914 
2915 /*
2916  * rib_srv_cm_handler()
2917  *    Connection Manager callback to handle RC connection requests.
2918  */
2919 /* ARGSUSED */
2920 static ibt_cm_status_t
2921 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2922 	ibt_cm_return_args_t *ret_args, void *priv_data,
2923 	ibt_priv_data_len_t len)
2924 {
2925 	queue_t		*q;
2926 	rib_qp_t	*qp;
2927 	rib_hca_t	*hca;
2928 	rdma_stat	status = RDMA_SUCCESS;
2929 	int		i;
2930 	struct clist	cl;
2931 	rdma_buf_t	rdbuf = {0};
2932 	void		*buf = NULL;
2933 	CONN		*conn;
2934 	ibt_ip_cm_info_t	ipinfo;
2935 	struct sockaddr_in *s;
2936 	struct sockaddr_in6 *s6;
2937 	int sin_size = sizeof (struct sockaddr_in);
2938 	int in_size = sizeof (struct in_addr);
2939 	int sin6_size = sizeof (struct sockaddr_in6);
2940 
2941 	ASSERT(any != NULL);
2942 	ASSERT(event != NULL);
2943 
2944 	hca = (rib_hca_t *)any;
2945 
2946 	/* got a connection request */
2947 	switch (event->cm_type) {
2948 	case IBT_CM_EVENT_REQ_RCV:
2949 		/*
2950 		 * If the plugin is in the NO_ACCEPT state, bail out.
2951 		 */
2952 		mutex_enter(&plugin_state_lock);
2953 		if (plugin_state == NO_ACCEPT) {
2954 			mutex_exit(&plugin_state_lock);
2955 			return (IBT_CM_REJECT);
2956 		}
2957 		mutex_exit(&plugin_state_lock);
2958 
2959 		/*
2960 		 * Need to send a MRA MAD to CM so that it does not
2961 		 * timeout on us.
2962 		 */
2963 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2964 		    event->cm_event.req.req_timeout * 8, NULL, 0);
2965 
2966 		mutex_enter(&rib_stat->open_hca_lock);
2967 		q = rib_stat->q;
2968 		mutex_exit(&rib_stat->open_hca_lock);
2969 
2970 		status = rib_svc_create_chan(hca, (caddr_t)q,
2971 		    event->cm_event.req.req_prim_hca_port, &qp);
2972 
2973 		if (status) {
2974 			return (IBT_CM_REJECT);
2975 		}
2976 
2977 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2978 		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2979 		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2980 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2981 
2982 		/*
2983 		 * Pre-posts RECV buffers
2984 		 */
2985 		conn = qptoc(qp);
2986 		for (i = 0; i < preposted_rbufs; i++) {
2987 			bzero(&rdbuf, sizeof (rdbuf));
2988 			rdbuf.type = RECV_BUFFER;
2989 			buf = rib_rbuf_alloc(conn, &rdbuf);
2990 			if (buf == NULL) {
2991 				/*
2992 				 * A connection is not established yet.
2993 				 * Just flush the channel. Buffers
2994 				 * posted till now will error out with
2995 				 * IBT_WC_WR_FLUSHED_ERR.
2996 				 */
2997 				(void) ibt_flush_channel(qp->qp_hdl);
2998 				(void) rib_disconnect_channel(conn, NULL);
2999 				return (IBT_CM_REJECT);
3000 			}
3001 
3002 			bzero(&cl, sizeof (cl));
3003 			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
3004 			cl.c_len = rdbuf.len;
3005 			cl.c_smemhandle.mrc_lmr =
3006 			    rdbuf.handle.mrc_lmr; /* lkey */
3007 			cl.c_next = NULL;
3008 			status = rib_post_recv(conn, &cl);
3009 			if (status != RDMA_SUCCESS) {
3010 				/*
3011 				 * A connection is not established yet.
3012 				 * Just flush the channel. Buffers
3013 				 * posted till now will error out with
3014 				 * IBT_WC_WR_FLUSHED_ERR.
3015 				 */
3016 				(void) ibt_flush_channel(qp->qp_hdl);
3017 				(void) rib_disconnect_channel(conn, NULL);
3018 				return (IBT_CM_REJECT);
3019 			}
3020 		}
3021 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
3022 
3023 		/*
3024 		 * Get the address translation
3025 		 */
3026 		rw_enter(&hca->state_lock, RW_READER);
3027 		if (hca->state == HCA_DETACHED) {
3028 			rw_exit(&hca->state_lock);
3029 			return (IBT_CM_REJECT);
3030 		}
3031 		rw_exit(&hca->state_lock);
3032 
3033 		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
3034 
3035 		if (ibt_get_ip_data(event->cm_priv_data_len,
3036 		    event->cm_priv_data,
3037 		    &ipinfo) != IBT_SUCCESS) {
3038 
3039 			return (IBT_CM_REJECT);
3040 		}
3041 
3042 		switch (ipinfo.src_addr.family) {
3043 		case AF_INET:
3044 
3045 			conn->c_raddr.maxlen =
3046 			    conn->c_raddr.len = sin_size;
3047 			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3048 
3049 			s = (struct sockaddr_in *)conn->c_raddr.buf;
3050 			s->sin_family = AF_INET;
3051 
3052 			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
3053 			    &s->sin_addr, in_size);
3054 
3055 			break;
3056 
3057 		case AF_INET6:
3058 
3059 			conn->c_raddr.maxlen =
3060 			    conn->c_raddr.len = sin6_size;
3061 			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3062 
3063 			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3064 			s6->sin6_family = AF_INET6;
3065 			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
3066 			    &s6->sin6_addr,
3067 			    sizeof (struct in6_addr));
3068 
3069 			break;
3070 
3071 		default:
3072 			return (IBT_CM_REJECT);
3073 		}
3074 
3075 		break;
3076 
3077 	case IBT_CM_EVENT_CONN_CLOSED:
3078 	{
3079 		CONN		*conn;
3080 		rib_qp_t	*qp;
3081 
3082 		switch (event->cm_event.closed) {
3083 		case IBT_CM_CLOSED_DREP_RCVD:
3084 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
3085 		case IBT_CM_CLOSED_DUP:
3086 		case IBT_CM_CLOSED_ABORT:
3087 		case IBT_CM_CLOSED_ALREADY:
3088 			/*
3089 			 * These cases indicate the local end initiated
3090 			 * the closing of the channel. Nothing to do here.
3091 			 */
3092 			break;
3093 		default:
3094 			/*
3095 			 * Reason for CONN_CLOSED event must be one of
3096 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
3097 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
3098 			 * the remote end is closing the channel. In these
3099 			 * cases free the channel and transition to error
3100 			 * state
3101 			 */
3102 			qp = ibt_get_chan_private(event->cm_channel);
3103 			conn = qptoc(qp);
3104 			mutex_enter(&conn->c_lock);
3105 			if (conn->c_state == C_DISCONN_PEND) {
3106 				mutex_exit(&conn->c_lock);
3107 				break;
3108 			}
3109 			conn->c_state = C_ERROR_CONN;
3110 
3111 			/*
3112 			 * Free the conn if c_ref goes down to 0
3113 			 */
3114 			if (conn->c_ref == 0) {
3115 				/*
3116 				 * Remove from list and free conn
3117 				 */
3118 				conn->c_state = C_DISCONN_PEND;
3119 				mutex_exit(&conn->c_lock);
3120 				(void) rib_disconnect_channel(conn,
3121 				    &hca->srv_conn_list);
3122 			} else {
3123 				/*
3124 				 * conn will be freed when c_ref goes to 0.
3125 				 * Indicate to cleaning thread not to close
3126 				 * the connection, but just free the channel.
3127 				 */
3128 				conn->c_flags |= C_CLOSE_NOTNEEDED;
3129 				mutex_exit(&conn->c_lock);
3130 			}
3131 			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3132 			break;
3133 		}
3134 		break;
3135 	}
3136 	case IBT_CM_EVENT_CONN_EST:
3137 		/*
3138 		 * RTU received, hence connection established.
3139 		 */
3140 		if (rib_debug > 1)
3141 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3142 			    "(CONN_EST) channel established");
3143 		break;
3144 
3145 	default:
3146 		if (rib_debug > 2) {
3147 			/* Let CM handle the following events. */
3148 			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3149 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3150 				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3151 			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3152 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3153 				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3154 			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3155 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3156 				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3157 			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3158 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3159 				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3160 			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3161 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3162 				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
3163 			}
3164 		}
3165 		return (IBT_CM_DEFAULT);
3166 	}
3167 
3168 	/* accept all other CM messages (i.e. let the CM handle them) */
3169 	return (IBT_CM_ACCEPT);
3170 }
3171 
3172 static rdma_stat
3173 rib_register_service(rib_hca_t *hca, int service_type,
3174 	uint8_t protocol_num, in_port_t dst_port)
3175 {
3176 	ibt_srv_desc_t		sdesc;
3177 	ibt_hca_portinfo_t	*port_infop;
3178 	ib_svc_id_t		srv_id;
3179 	ibt_srv_hdl_t		srv_hdl;
3180 	uint_t			port_size;
3181 	uint_t			pki, i, num_ports, nbinds;
3182 	ibt_status_t		ibt_status;
3183 	rib_service_t		*service;
3184 	ib_pkey_t		pkey;
3185 
3186 	/*
3187 	 * Query all ports for the given HCA
3188 	 */
3189 	rw_enter(&hca->state_lock, RW_READER);
3190 	if (hca->state != HCA_DETACHED) {
3191 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3192 		    &num_ports, &port_size);
3193 		rw_exit(&hca->state_lock);
3194 	} else {
3195 		rw_exit(&hca->state_lock);
3196 		return (RDMA_FAILED);
3197 	}
3198 	if (ibt_status != IBT_SUCCESS) {
3199 		return (RDMA_FAILED);
3200 	}
3201 
3202 	DTRACE_PROBE1(rpcib__i__regservice_numports,
3203 	    int, num_ports);
3204 
3205 	for (i = 0; i < num_ports; i++) {
3206 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3207 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3208 			    int, i+1);
3209 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3210 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
3211 			    int, i+1);
3212 		}
3213 	}
3214 
3215 	/*
3216 	 * Get all the IP addresses on this system to register the
3217 	 * given "service type" on all DNS recognized IP addrs.
3218 	 * Each service type such as NFS will have all the systems
3219 	 * IP addresses as its different names. For now the only
3220 	 * type of service we support in RPCIB is NFS.
3221 	 */
3222 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3223 	/*
3224 	 * Start registering and binding service to active
3225 	 * on active ports on this HCA.
3226 	 */
3227 	nbinds = 0;
3228 	for (service = rib_stat->service_list;
3229 	    service && (service->srv_type != service_type);
3230 	    service = service->next)
3231 		;
3232 
3233 	if (service == NULL) {
3234 		/*
3235 		 * We use IP addresses as the service names for
3236 		 * service registration.  Register each of them
3237 		 * with CM to obtain a svc_id and svc_hdl.  We do not
3238 		 * register the service with machine's loopback address.
3239 		 */
3240 		(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3241 		(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3242 		(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3243 		sdesc.sd_handler = rib_srv_cm_handler;
3244 		sdesc.sd_flags = 0;
3245 		ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3246 		    &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3247 		    1, &srv_hdl, &srv_id);
3248 		if ((ibt_status != IBT_SUCCESS) &&
3249 		    (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3250 			rw_exit(&rib_stat->service_list_lock);
3251 			DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3252 			    int, ibt_status);
3253 			ibt_free_portinfo(port_infop, port_size);
3254 			return (RDMA_FAILED);
3255 		}
3256 
3257 		/*
3258 		 * Allocate and prepare a service entry
3259 		 */
3260 		service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3261 
3262 		service->srv_type = service_type;
3263 		service->srv_hdl = srv_hdl;
3264 		service->srv_id = srv_id;
3265 
3266 		service->next = rib_stat->service_list;
3267 		rib_stat->service_list = service;
3268 		DTRACE_PROBE1(rpcib__i__regservice__new__service,
3269 		    int, service->srv_type);
3270 	} else {
3271 		srv_hdl = service->srv_hdl;
3272 		srv_id = service->srv_id;
3273 		DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3274 		    int, service->srv_type);
3275 	}
3276 
3277 	for (i = 0; i < num_ports; i++) {
3278 		ibt_sbind_hdl_t		sbp;
3279 		rib_hca_service_t	*hca_srv;
3280 		ib_gid_t		gid;
3281 
3282 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3283 			continue;
3284 
3285 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3286 			pkey = port_infop[i].p_pkey_tbl[pki];
3287 
3288 			rw_enter(&hca->bound_services_lock, RW_READER);
3289 			gid = port_infop[i].p_sgid_tbl[0];
3290 			for (hca_srv = hca->bound_services; hca_srv;
3291 			    hca_srv = hca_srv->next) {
3292 				if ((hca_srv->srv_id == service->srv_id) &&
3293 				    (hca_srv->gid.gid_prefix ==
3294 				    gid.gid_prefix) &&
3295 				    (hca_srv->gid.gid_guid == gid.gid_guid))
3296 					break;
3297 			}
3298 			rw_exit(&hca->bound_services_lock);
3299 			if (hca_srv != NULL) {
3300 				/*
3301 				 * port is alreay bound the the service
3302 				 */
3303 				DTRACE_PROBE1(
3304 				    rpcib__i__regservice__already__bound,
3305 				    int, i+1);
3306 				nbinds++;
3307 				continue;
3308 			}
3309 
3310 			if ((pkey & IBSRM_HB) &&
3311 			    (pkey != IB_PKEY_INVALID_FULL)) {
3312 
3313 				sbp = NULL;
3314 				ibt_status = ibt_bind_service(srv_hdl,
3315 				    gid, NULL, hca, &sbp);
3316 
3317 				if (ibt_status == IBT_SUCCESS) {
3318 					hca_srv = kmem_zalloc(
3319 					    sizeof (rib_hca_service_t),
3320 					    KM_SLEEP);
3321 					hca_srv->srv_id = srv_id;
3322 					hca_srv->gid = gid;
3323 					hca_srv->sbind_hdl = sbp;
3324 
3325 					rw_enter(&hca->bound_services_lock,
3326 					    RW_WRITER);
3327 					hca_srv->next = hca->bound_services;
3328 					hca->bound_services = hca_srv;
3329 					rw_exit(&hca->bound_services_lock);
3330 					nbinds++;
3331 				}
3332 
3333 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3334 				    int, ibt_status);
3335 			}
3336 		}
3337 	}
3338 	rw_exit(&rib_stat->service_list_lock);
3339 
3340 	ibt_free_portinfo(port_infop, port_size);
3341 
3342 	if (nbinds == 0) {
3343 		return (RDMA_FAILED);
3344 	} else {
3345 		/*
3346 		 * Put this plugin into accept state, since atleast
3347 		 * one registration was successful.
3348 		 */
3349 		mutex_enter(&plugin_state_lock);
3350 		plugin_state = ACCEPT;
3351 		mutex_exit(&plugin_state_lock);
3352 		return (RDMA_SUCCESS);
3353 	}
3354 }
3355 
3356 void
3357 rib_listen(struct rdma_svc_data *rd)
3358 {
3359 	rdma_stat status;
3360 	int n_listening = 0;
3361 	rib_hca_t *hca;
3362 
3363 	mutex_enter(&rib_stat->listen_lock);
3364 	/*
3365 	 * if rd parameter is NULL then it means that rib_stat->q is
3366 	 * already initialized by a call from RDMA and we just want to
3367 	 * add a newly attached HCA to the same listening state as other
3368 	 * HCAs.
3369 	 */
3370 	if (rd == NULL) {
3371 		if (rib_stat->q == NULL) {
3372 			mutex_exit(&rib_stat->listen_lock);
3373 			return;
3374 		}
3375 	} else {
3376 		rib_stat->q = &rd->q;
3377 	}
3378 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3379 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3380 		/*
3381 		 * First check if a hca is still attached
3382 		 */
3383 		rw_enter(&hca->state_lock, RW_READER);
3384 		if (hca->state != HCA_INITED) {
3385 			rw_exit(&hca->state_lock);
3386 			continue;
3387 		}
3388 		rw_exit(&hca->state_lock);
3389 
3390 		/*
3391 		 * Right now the only service type is NFS. Hence
3392 		 * force feed this value. Ideally to communicate
3393 		 * the service type it should be passed down in
3394 		 * rdma_svc_data.
3395 		 */
3396 		status = rib_register_service(hca, NFS,
3397 		    IPPROTO_TCP, nfs_rdma_port);
3398 		if (status == RDMA_SUCCESS)
3399 			n_listening++;
3400 	}
3401 	rw_exit(&rib_stat->hcas_list_lock);
3402 
3403 	/*
3404 	 * Service active on an HCA, check rd->err_code for more
3405 	 * explainable errors.
3406 	 */
3407 	if (rd) {
3408 		if (n_listening > 0) {
3409 			rd->active = 1;
3410 			rd->err_code = RDMA_SUCCESS;
3411 		} else {
3412 			rd->active = 0;
3413 			rd->err_code = RDMA_FAILED;
3414 		}
3415 	}
3416 	mutex_exit(&rib_stat->listen_lock);
3417 }
3418 
3419 /* XXXX */
3420 /* ARGSUSED */
3421 static void
3422 rib_listen_stop(struct rdma_svc_data *svcdata)
3423 {
3424 	rib_hca_t		*hca;
3425 
3426 	mutex_enter(&rib_stat->listen_lock);
3427 	/*
3428 	 * KRPC called the RDMATF to stop the listeners, this means
3429 	 * stop sending incomming or recieved requests to KRPC master
3430 	 * transport handle for RDMA-IB. This is also means that the
3431 	 * master transport handle, responsible for us, is going away.
3432 	 */
3433 	mutex_enter(&plugin_state_lock);
3434 	plugin_state = NO_ACCEPT;
3435 	if (svcdata != NULL)
3436 		svcdata->active = 0;
3437 	mutex_exit(&plugin_state_lock);
3438 
3439 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3440 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3441 		/*
3442 		 * First check if a hca is still attached
3443 		 */
3444 		rw_enter(&hca->state_lock, RW_READER);
3445 		if (hca->state == HCA_DETACHED) {
3446 			rw_exit(&hca->state_lock);
3447 			continue;
3448 		}
3449 		rib_close_channels(&hca->srv_conn_list);
3450 		rib_stop_services(hca);
3451 		rw_exit(&hca->state_lock);
3452 	}
3453 	rw_exit(&rib_stat->hcas_list_lock);
3454 
3455 	/*
3456 	 * Avoid rib_listen() using the stale q field.
3457 	 * This could happen if a port goes up after all services
3458 	 * are already unregistered.
3459 	 */
3460 	rib_stat->q = NULL;
3461 	mutex_exit(&rib_stat->listen_lock);
3462 }
3463 
3464 /*
3465  * Traverse the HCA's service list to unbind and deregister services.
3466  * For each bound service of HCA to be removed, first find the corresponding
3467  * service handle (srv_hdl) and then unbind the service by calling
3468  * ibt_unbind_service().
3469  */
3470 static void
3471 rib_stop_services(rib_hca_t *hca)
3472 {
3473 	rib_hca_service_t *srv_list, *to_remove;
3474 
3475 	/*
3476 	 * unbind and deregister the services for this service type.
3477 	 * Right now there is only one service type. In future it will
3478 	 * be passed down to this function.
3479 	 */
3480 	rw_enter(&hca->bound_services_lock, RW_READER);
3481 	srv_list = hca->bound_services;
3482 	hca->bound_services = NULL;
3483 	rw_exit(&hca->bound_services_lock);
3484 
3485 	while (srv_list != NULL) {
3486 		rib_service_t *sc;
3487 
3488 		to_remove = srv_list;
3489 		srv_list = to_remove->next;
3490 		rw_enter(&rib_stat->service_list_lock, RW_READER);
3491 		for (sc = rib_stat->service_list;
3492 		    sc && (sc->srv_id != to_remove->srv_id);
3493 		    sc = sc->next)
3494 			;
3495 		/*
3496 		 * if sc is NULL then the service doesn't exist anymore,
3497 		 * probably just removed completely through rib_stat.
3498 		 */
3499 		if (sc != NULL)
3500 			(void) ibt_unbind_service(sc->srv_hdl,
3501 			    to_remove->sbind_hdl);
3502 		rw_exit(&rib_stat->service_list_lock);
3503 		kmem_free(to_remove, sizeof (rib_hca_service_t));
3504 	}
3505 }
3506 
3507 static struct svc_recv *
3508 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3509 {
3510 	struct svc_recv	*recvp;
3511 
3512 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3513 	recvp->vaddr = sgl->ds_va;
3514 	recvp->qp = qp;
3515 	recvp->bytes_xfer = 0;
3516 	return (recvp);
3517 }
3518 
3519 static int
3520 rib_free_svc_recv(struct svc_recv *recvp)
3521 {
3522 	kmem_free(recvp, sizeof (*recvp));
3523 
3524 	return (0);
3525 }
3526 
3527 static struct reply *
3528 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3529 {
3530 	struct reply	*rep;
3531 
3532 
3533 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3534 	if (rep == NULL) {
3535 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3536 		return (NULL);
3537 	}
3538 	rep->xid = msgid;
3539 	rep->vaddr_cq = NULL;
3540 	rep->bytes_xfer = 0;
3541 	rep->status = (uint_t)REPLY_WAIT;
3542 	rep->prev = NULL;
3543 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3544 
3545 	mutex_enter(&qp->replylist_lock);
3546 	if (qp->replylist) {
3547 		rep->next = qp->replylist;
3548 		qp->replylist->prev = rep;
3549 	}
3550 	qp->rep_list_size++;
3551 
3552 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3553 	    int, qp->rep_list_size);
3554 
3555 	qp->replylist = rep;
3556 	mutex_exit(&qp->replylist_lock);
3557 
3558 	return (rep);
3559 }
3560 
3561 static rdma_stat
3562 rib_rem_replylist(rib_qp_t *qp)
3563 {
3564 	struct reply	*r, *n;
3565 
3566 	mutex_enter(&qp->replylist_lock);
3567 	for (r = qp->replylist; r != NULL; r = n) {
3568 		n = r->next;
3569 		(void) rib_remreply(qp, r);
3570 	}
3571 	mutex_exit(&qp->replylist_lock);
3572 
3573 	return (RDMA_SUCCESS);
3574 }
3575 
3576 static int
3577 rib_remreply(rib_qp_t *qp, struct reply *rep)
3578 {
3579 
3580 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3581 	if (rep->prev) {
3582 		rep->prev->next = rep->next;
3583 	}
3584 	if (rep->next) {
3585 		rep->next->prev = rep->prev;
3586 	}
3587 	if (qp->replylist == rep)
3588 		qp->replylist = rep->next;
3589 
3590 	cv_destroy(&rep->wait_cv);
3591 	qp->rep_list_size--;
3592 
3593 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3594 	    int, qp->rep_list_size);
3595 
3596 	kmem_free(rep, sizeof (*rep));
3597 
3598 	return (0);
3599 }
3600 
3601 rdma_stat
3602 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3603 	struct mrc *buf_handle)
3604 {
3605 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3606 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3607 	rdma_stat	status;
3608 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3609 
3610 	/*
3611 	 * Note: ALL buffer pools use the same memory type RDMARW.
3612 	 */
3613 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3614 	if (status == RDMA_SUCCESS) {
3615 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3616 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3617 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3618 	} else {
3619 		buf_handle->mrc_linfo = NULL;
3620 		buf_handle->mrc_lmr = 0;
3621 		buf_handle->mrc_rmr = 0;
3622 	}
3623 	return (status);
3624 }
3625 
3626 static rdma_stat
3627 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3628 	ibt_mr_flags_t spec,
3629 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3630 {
3631 	ibt_mr_attr_t	mem_attr;
3632 	ibt_status_t	ibt_status;
3633 	mem_attr.mr_vaddr = (uintptr_t)buf;
3634 	mem_attr.mr_len = (ib_msglen_t)size;
3635 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3636 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3637 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3638 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3639 
3640 	rw_enter(&hca->state_lock, RW_READER);
3641 	if (hca->state != HCA_DETACHED) {
3642 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3643 		    &mem_attr, mr_hdlp, mr_descp);
3644 		rw_exit(&hca->state_lock);
3645 	} else {
3646 		rw_exit(&hca->state_lock);
3647 		return (RDMA_FAILED);
3648 	}
3649 
3650 	if (ibt_status != IBT_SUCCESS) {
3651 		return (RDMA_FAILED);
3652 	}
3653 	return (RDMA_SUCCESS);
3654 }
3655 
3656 rdma_stat
3657 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3658 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3659 {
3660 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3661 	rib_lrc_entry_t *l;
3662 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3663 	rdma_stat	status;
3664 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3665 
3666 	/*
3667 	 * Non-coherent memory registration.
3668 	 */
3669 	l = (rib_lrc_entry_t *)lrc;
3670 	if (l) {
3671 		if (l->registered) {
3672 			buf_handle->mrc_linfo =
3673 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3674 			buf_handle->mrc_lmr =
3675 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3676 			buf_handle->mrc_rmr =
3677 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3678 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3679 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3680 			return (RDMA_SUCCESS);
3681 		} else {
3682 			/* Always register the whole buffer */
3683 			buf = (caddr_t)l->lrc_buf;
3684 			buflen = l->lrc_len;
3685 		}
3686 	}
3687 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3688 
3689 	if (status == RDMA_SUCCESS) {
3690 		if (l) {
3691 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3692 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3693 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3694 			l->registered		 = TRUE;
3695 		}
3696 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3697 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3698 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3699 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3700 	} else {
3701 		buf_handle->mrc_linfo = NULL;
3702 		buf_handle->mrc_lmr = 0;
3703 		buf_handle->mrc_rmr = 0;
3704 	}
3705 	return (status);
3706 }
3707 
3708 /* ARGSUSED */
3709 rdma_stat
3710 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3711 {
3712 	rib_hca_t *hca = (ctoqp(conn))->hca;
3713 	/*
3714 	 * Allow memory deregistration even if HCA is
3715 	 * getting detached. Need all outstanding
3716 	 * memory registrations to be deregistered
3717 	 * before HCA_DETACH_EVENT can be accepted.
3718 	 */
3719 	(void) ibt_deregister_mr(hca->hca_hdl,
3720 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3721 	return (RDMA_SUCCESS);
3722 }
3723 
3724 /* ARGSUSED */
3725 rdma_stat
3726 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3727 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3728 {
3729 	rib_lrc_entry_t *l;
3730 	l = (rib_lrc_entry_t *)lrc;
3731 	if (l)
3732 		if (l->registered)
3733 			return (RDMA_SUCCESS);
3734 
3735 	(void) rib_deregistermem(conn, buf, buf_handle);
3736 
3737 	return (RDMA_SUCCESS);
3738 }
3739 
3740 /* ARGSUSED */
3741 rdma_stat
3742 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3743 		int len, int cpu)
3744 {
3745 	ibt_status_t	status;
3746 	rib_hca_t *hca = (ctoqp(conn))->hca;
3747 	ibt_mr_sync_t	mr_segment;
3748 
3749 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3750 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3751 	mr_segment.ms_len = (ib_memlen_t)len;
3752 	if (cpu) {
3753 		/* make incoming data visible to memory */
3754 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3755 	} else {
3756 		/* make memory changes visible to IO */
3757 		mr_segment.ms_flags = IBT_SYNC_READ;
3758 	}
3759 	rw_enter(&hca->state_lock, RW_READER);
3760 	if (hca->state != HCA_DETACHED) {
3761 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3762 		rw_exit(&hca->state_lock);
3763 	} else {
3764 		rw_exit(&hca->state_lock);
3765 		return (RDMA_FAILED);
3766 	}
3767 
3768 	if (status == IBT_SUCCESS)
3769 		return (RDMA_SUCCESS);
3770 	else {
3771 		return (RDMA_FAILED);
3772 	}
3773 }
3774 
3775 /*
3776  * XXXX	????
3777  */
3778 static rdma_stat
3779 rib_getinfo(rdma_info_t *info)
3780 {
3781 	/*
3782 	 * XXXX	Hack!
3783 	 */
3784 	info->addrlen = 16;
3785 	info->mts = 1000000;
3786 	info->mtu = 1000000;
3787 
3788 	return (RDMA_SUCCESS);
3789 }
3790 
3791 rib_bufpool_t *
3792 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3793 {
3794 	rib_bufpool_t	*rbp = NULL;
3795 	bufpool_t	*bp = NULL;
3796 	caddr_t		buf;
3797 	ibt_mr_attr_t	mem_attr;
3798 	ibt_status_t	ibt_status;
3799 	int		i, j;
3800 
3801 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3802 
3803 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3804 	    num * sizeof (void *), KM_SLEEP);
3805 
3806 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3807 	bp->numelems = num;
3808 
3809 
3810 	switch (ptype) {
3811 	case SEND_BUFFER:
3812 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3813 		bp->rsize = RPC_MSG_SZ;
3814 		break;
3815 	case RECV_BUFFER:
3816 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3817 		bp->rsize = RPC_BUF_SIZE;
3818 		break;
3819 	default:
3820 		goto fail;
3821 	}
3822 
3823 	/*
3824 	 * Register the pool.
3825 	 */
3826 	bp->bufsize = num * bp->rsize;
3827 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3828 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3829 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3830 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3831 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3832 	rw_enter(&hca->state_lock, RW_READER);
3833 
3834 	if (hca->state == HCA_DETACHED) {
3835 		rw_exit(&hca->state_lock);
3836 		goto fail;
3837 	}
3838 
3839 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3840 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3841 		mem_attr.mr_vaddr = (uintptr_t)buf;
3842 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3843 		mem_attr.mr_as = NULL;
3844 		ibt_status = ibt_register_mr(hca->hca_hdl,
3845 		    hca->pd_hdl, &mem_attr,
3846 		    &rbp->mr_hdl[i],
3847 		    &rbp->mr_desc[i]);
3848 		if (ibt_status != IBT_SUCCESS) {
3849 			for (j = 0; j < i; j++) {
3850 				(void) ibt_deregister_mr(hca->hca_hdl,
3851 				    rbp->mr_hdl[j]);
3852 			}
3853 			rw_exit(&hca->state_lock);
3854 			goto fail;
3855 		}
3856 	}
3857 	rw_exit(&hca->state_lock);
3858 	buf = (caddr_t)bp->buf;
3859 	for (i = 0; i < num; i++, buf += bp->rsize) {
3860 		bp->buflist[i] = (void *)buf;
3861 	}
3862 	bp->buffree = num - 1;	/* no. of free buffers */
3863 	rbp->bpool = bp;
3864 
3865 	return (rbp);
3866 fail:
3867 	if (bp) {
3868 		if (bp->buf)
3869 			kmem_free(bp->buf, bp->bufsize);
3870 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3871 	}
3872 	if (rbp) {
3873 		if (rbp->mr_hdl)
3874 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3875 		if (rbp->mr_desc)
3876 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3877 		kmem_free(rbp, sizeof (rib_bufpool_t));
3878 	}
3879 	return (NULL);
3880 }
3881 
3882 static void
3883 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3884 {
3885 	int i;
3886 	rib_bufpool_t *rbp = NULL;
3887 	bufpool_t *bp;
3888 
3889 	/*
3890 	 * Obtain pool address based on type of pool
3891 	 */
3892 	switch (ptype) {
3893 		case SEND_BUFFER:
3894 			rbp = hca->send_pool;
3895 			break;
3896 		case RECV_BUFFER:
3897 			rbp = hca->recv_pool;
3898 			break;
3899 		default:
3900 			return;
3901 	}
3902 	if (rbp == NULL)
3903 		return;
3904 
3905 	bp = rbp->bpool;
3906 
3907 	/*
3908 	 * Deregister the pool memory and free it.
3909 	 */
3910 	for (i = 0; i < bp->numelems; i++) {
3911 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3912 	}
3913 }
3914 
3915 static void
3916 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3917 {
3918 
3919 	rib_bufpool_t *rbp = NULL;
3920 	bufpool_t *bp;
3921 
3922 	/*
3923 	 * Obtain pool address based on type of pool
3924 	 */
3925 	switch (ptype) {
3926 		case SEND_BUFFER:
3927 			rbp = hca->send_pool;
3928 			break;
3929 		case RECV_BUFFER:
3930 			rbp = hca->recv_pool;
3931 			break;
3932 		default:
3933 			return;
3934 	}
3935 	if (rbp == NULL)
3936 		return;
3937 
3938 	bp = rbp->bpool;
3939 
3940 	/*
3941 	 * Free the pool memory.
3942 	 */
3943 	if (rbp->mr_hdl)
3944 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3945 
3946 	if (rbp->mr_desc)
3947 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3948 	if (bp->buf)
3949 		kmem_free(bp->buf, bp->bufsize);
3950 	mutex_destroy(&bp->buflock);
3951 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3952 	kmem_free(rbp, sizeof (rib_bufpool_t));
3953 }
3954 
3955 void
3956 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3957 {
3958 	/*
3959 	 * Deregister the pool memory and free it.
3960 	 */
3961 	rib_rbufpool_deregister(hca, ptype);
3962 	rib_rbufpool_free(hca, ptype);
3963 }
3964 
3965 /*
3966  * Fetch a buffer from the pool of type specified in rdbuf->type.
3967  */
3968 static rdma_stat
3969 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3970 {
3971 	rib_lrc_entry_t *rlep;
3972 
3973 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
3974 		rlep = rib_get_cache_buf(conn, rdbuf->len);
3975 		rdbuf->rb_private =  (caddr_t)rlep;
3976 		rdbuf->addr = rlep->lrc_buf;
3977 		rdbuf->handle = rlep->lrc_mhandle;
3978 		return (RDMA_SUCCESS);
3979 	}
3980 
3981 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
3982 	if (rdbuf->addr) {
3983 		switch (rdbuf->type) {
3984 		case SEND_BUFFER:
3985 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
3986 			break;
3987 		case RECV_BUFFER:
3988 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
3989 			break;
3990 		default:
3991 			rdbuf->len = 0;
3992 		}
3993 		return (RDMA_SUCCESS);
3994 	} else
3995 		return (RDMA_FAILED);
3996 }
3997 
3998 /*
3999  * Fetch a buffer of specified type.
4000  * Note that rdbuf->handle is mw's rkey.
4001  */
4002 static void *
4003 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4004 {
4005 	rib_qp_t	*qp = ctoqp(conn);
4006 	rib_hca_t	*hca = qp->hca;
4007 	rdma_btype	ptype = rdbuf->type;
4008 	void		*buf;
4009 	rib_bufpool_t	*rbp = NULL;
4010 	bufpool_t	*bp;
4011 	int		i;
4012 
4013 	/*
4014 	 * Obtain pool address based on type of pool
4015 	 */
4016 	switch (ptype) {
4017 	case SEND_BUFFER:
4018 		rbp = hca->send_pool;
4019 		break;
4020 	case RECV_BUFFER:
4021 		rbp = hca->recv_pool;
4022 		break;
4023 	default:
4024 		return (NULL);
4025 	}
4026 	if (rbp == NULL)
4027 		return (NULL);
4028 
4029 	bp = rbp->bpool;
4030 
4031 	mutex_enter(&bp->buflock);
4032 	if (bp->buffree < 0) {
4033 		mutex_exit(&bp->buflock);
4034 		return (NULL);
4035 	}
4036 
4037 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4038 	buf = bp->buflist[bp->buffree];
4039 	rdbuf->addr = buf;
4040 	rdbuf->len = bp->rsize;
4041 	for (i = bp->numelems - 1; i >= 0; i--) {
4042 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4043 			rdbuf->handle.mrc_rmr =
4044 			    (uint32_t)rbp->mr_desc[i].md_rkey;
4045 			rdbuf->handle.mrc_linfo =
4046 			    (uintptr_t)rbp->mr_hdl[i];
4047 			rdbuf->handle.mrc_lmr =
4048 			    (uint32_t)rbp->mr_desc[i].md_lkey;
4049 			bp->buffree--;
4050 
4051 			mutex_exit(&bp->buflock);
4052 
4053 			return (buf);
4054 		}
4055 	}
4056 
4057 	mutex_exit(&bp->buflock);
4058 
4059 	return (NULL);
4060 }
4061 
4062 static void
4063 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4064 {
4065 
4066 	if (rdbuf->type == RDMA_LONG_BUFFER) {
4067 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4068 		rdbuf->rb_private = NULL;
4069 		return;
4070 	}
4071 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4072 }
4073 
4074 static void
4075 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4076 {
4077 	rib_qp_t *qp = ctoqp(conn);
4078 	rib_hca_t *hca = qp->hca;
4079 	rib_bufpool_t *rbp = NULL;
4080 	bufpool_t *bp;
4081 
4082 	/*
4083 	 * Obtain pool address based on type of pool
4084 	 */
4085 	switch (ptype) {
4086 	case SEND_BUFFER:
4087 		rbp = hca->send_pool;
4088 		break;
4089 	case RECV_BUFFER:
4090 		rbp = hca->recv_pool;
4091 		break;
4092 	default:
4093 		return;
4094 	}
4095 	if (rbp == NULL)
4096 		return;
4097 
4098 	bp = rbp->bpool;
4099 
4100 	mutex_enter(&bp->buflock);
4101 	if (++bp->buffree >= bp->numelems) {
4102 		/*
4103 		 * Should never happen
4104 		 */
4105 		bp->buffree--;
4106 	} else {
4107 		bp->buflist[bp->buffree] = buf;
4108 	}
4109 	mutex_exit(&bp->buflock);
4110 }
4111 
4112 static rdma_stat
4113 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4114 {
4115 	rw_enter(&connlist->conn_lock, RW_WRITER);
4116 	if (connlist->conn_hd) {
4117 		cn->c_next = connlist->conn_hd;
4118 		connlist->conn_hd->c_prev = cn;
4119 	}
4120 	connlist->conn_hd = cn;
4121 	rw_exit(&connlist->conn_lock);
4122 
4123 	return (RDMA_SUCCESS);
4124 }
4125 
4126 static rdma_stat
4127 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4128 {
4129 	rw_enter(&connlist->conn_lock, RW_WRITER);
4130 	if (cn->c_prev) {
4131 		cn->c_prev->c_next = cn->c_next;
4132 	}
4133 	if (cn->c_next) {
4134 		cn->c_next->c_prev = cn->c_prev;
4135 	}
4136 	if (connlist->conn_hd == cn)
4137 		connlist->conn_hd = cn->c_next;
4138 	rw_exit(&connlist->conn_lock);
4139 
4140 	return (RDMA_SUCCESS);
4141 }
4142 
4143 /* ARGSUSED */
4144 static rdma_stat
4145 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4146     int addr_type, void *handle, CONN **conn)
4147 {
4148 	rdma_stat status;
4149 	rpcib_ping_t rpt;
4150 
4151 	status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4152 	return (status);
4153 }
4154 
4155 /*
4156  * rib_find_hca_connection
4157  *
4158  * if there is an existing connection to the specified address then
4159  * it will be returned in conn, otherwise conn will be set to NULL.
4160  * Also cleans up any connection that is in error state.
4161  */
4162 static int
4163 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
4164     struct netbuf *d_svcaddr, CONN **conn)
4165 {
4166 	CONN *cn;
4167 	clock_t cv_stat, timout;
4168 
4169 	*conn = NULL;
4170 again:
4171 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4172 	cn = hca->cl_conn_list.conn_hd;
4173 	while (cn != NULL) {
4174 		/*
4175 		 * First, clear up any connection in the ERROR state
4176 		 */
4177 		mutex_enter(&cn->c_lock);
4178 		if (cn->c_state == C_ERROR_CONN) {
4179 			if (cn->c_ref == 0) {
4180 				/*
4181 				 * Remove connection from list and destroy it.
4182 				 */
4183 				cn->c_state = C_DISCONN_PEND;
4184 				mutex_exit(&cn->c_lock);
4185 				rw_exit(&hca->cl_conn_list.conn_lock);
4186 				rib_conn_close((void *)cn);
4187 				goto again;
4188 			}
4189 			mutex_exit(&cn->c_lock);
4190 			cn = cn->c_next;
4191 			continue;
4192 		}
4193 		if (cn->c_state == C_DISCONN_PEND) {
4194 			mutex_exit(&cn->c_lock);
4195 			cn = cn->c_next;
4196 			continue;
4197 		}
4198 
4199 		/*
4200 		 * source address is only checked for if there is one,
4201 		 * this is the case for retries.
4202 		 */
4203 		if ((cn->c_raddr.len == d_svcaddr->len) &&
4204 		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
4205 		    d_svcaddr->len) == 0) &&
4206 		    ((s_svcaddr->len == 0) ||
4207 		    ((cn->c_laddr.len == s_svcaddr->len) &&
4208 		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
4209 		    s_svcaddr->len) == 0)))) {
4210 			/*
4211 			 * Our connection. Give up conn list lock
4212 			 * as we are done traversing the list.
4213 			 */
4214 			rw_exit(&hca->cl_conn_list.conn_lock);
4215 			if (cn->c_state == C_CONNECTED) {
4216 				cn->c_ref++;	/* sharing a conn */
4217 				mutex_exit(&cn->c_lock);
4218 				*conn = cn;
4219 				return (RDMA_SUCCESS);
4220 			}
4221 			if (cn->c_state == C_CONN_PEND) {
4222 				/*
4223 				 * Hold a reference to this conn before
4224 				 * we give up the lock.
4225 				 */
4226 				cn->c_ref++;
4227 				timout =  ddi_get_lbolt() +
4228 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
4229 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4230 				    &cn->c_lock, timout)) > 0 &&
4231 				    cn->c_state == C_CONN_PEND)
4232 					;
4233 				if (cv_stat == 0) {
4234 					cn->c_ref--;
4235 					mutex_exit(&cn->c_lock);
4236 					return (RDMA_INTR);
4237 				}
4238 				if (cv_stat < 0) {
4239 					cn->c_ref--;
4240 					mutex_exit(&cn->c_lock);
4241 					return (RDMA_TIMEDOUT);
4242 				}
4243 				if (cn->c_state == C_CONNECTED) {
4244 					*conn = cn;
4245 					mutex_exit(&cn->c_lock);
4246 					return (RDMA_SUCCESS);
4247 				} else {
4248 					cn->c_ref--;
4249 					mutex_exit(&cn->c_lock);
4250 					return (RDMA_TIMEDOUT);
4251 				}
4252 			}
4253 		}
4254 		mutex_exit(&cn->c_lock);
4255 		cn = cn->c_next;
4256 	}
4257 	rw_exit(&hca->cl_conn_list.conn_lock);
4258 	*conn = NULL;
4259 	return (RDMA_FAILED);
4260 }
4261 
4262 /*
4263  * Connection management.
4264  * IBTF does not support recycling of channels. So connections are only
4265  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
4266  * C_DISCONN_PEND state. No C_IDLE state.
4267  * C_CONN_PEND state: Connection establishment in progress to the server.
4268  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4269  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4270  * only in this state.
4271  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4272  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4273  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4274  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4275  * c_ref drops to 0 (this indicates that RPC has no more references to this
4276  * connection), the connection should be destroyed. A connection transitions
4277  * into this state when it is being destroyed.
4278  */
4279 /* ARGSUSED */
4280 static rdma_stat
4281 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4282     int addr_type, rpcib_ping_t *rpt, CONN **conn)
4283 {
4284 	CONN *cn;
4285 	int status;
4286 	rib_hca_t *hca;
4287 	rib_qp_t *qp;
4288 	int s_addr_len;
4289 	char *s_addr_buf;
4290 
4291 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4292 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4293 		rw_enter(&hca->state_lock, RW_READER);
4294 		if (hca->state != HCA_DETACHED) {
4295 			status = rib_find_hca_connection(hca, s_svcaddr,
4296 			    d_svcaddr, conn);
4297 			rw_exit(&hca->state_lock);
4298 			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4299 				rw_exit(&rib_stat->hcas_list_lock);
4300 				return (status);
4301 			}
4302 		} else
4303 			rw_exit(&hca->state_lock);
4304 	}
4305 	rw_exit(&rib_stat->hcas_list_lock);
4306 
4307 	/*
4308 	 * No existing connection found, establish a new connection.
4309 	 */
4310 	bzero(rpt, sizeof (rpcib_ping_t));
4311 
4312 	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4313 	if (status != RDMA_SUCCESS) {
4314 		return (RDMA_FAILED);
4315 	}
4316 	hca = rpt->hca;
4317 
4318 	if (rpt->srcip.family == AF_INET) {
4319 		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4320 		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4321 	} else if (rpt->srcip.family == AF_INET6) {
4322 		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4323 		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4324 	} else {
4325 		return (RDMA_FAILED);
4326 	}
4327 
4328 	/*
4329 	 * Channel to server doesn't exist yet, create one.
4330 	 */
4331 	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4332 		return (RDMA_FAILED);
4333 	}
4334 	cn = qptoc(qp);
4335 	cn->c_state = C_CONN_PEND;
4336 	cn->c_ref = 1;
4337 
4338 	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4339 	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4340 	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4341 
4342 	/*
4343 	 * Add to conn list.
4344 	 * We had given up the READER lock. In the time since then,
4345 	 * another thread might have created the connection we are
4346 	 * trying here. But for now, that is quiet alright - there
4347 	 * might be two connections between a pair of hosts instead
4348 	 * of one. If we really want to close that window,
4349 	 * then need to check the list after acquiring the
4350 	 * WRITER lock.
4351 	 */
4352 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4353 	status = rib_conn_to_srv(hca, qp, rpt);
4354 	mutex_enter(&cn->c_lock);
4355 	if (status == RDMA_SUCCESS) {
4356 		cn->c_state = C_CONNECTED;
4357 		*conn = cn;
4358 	} else {
4359 		cn->c_state = C_ERROR_CONN;
4360 		cn->c_ref--;
4361 	}
4362 	cv_broadcast(&cn->c_cv);
4363 	mutex_exit(&cn->c_lock);
4364 	return (status);
4365 }
4366 
4367 static void
4368 rib_conn_close(void *rarg)
4369 {
4370 	CONN *conn = (CONN *)rarg;
4371 	rib_qp_t *qp = ctoqp(conn);
4372 
4373 	mutex_enter(&conn->c_lock);
4374 	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4375 
4376 		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4377 		/*
4378 		 * Live connection in CONNECTED state.
4379 		 */
4380 		if (conn->c_state == C_CONNECTED) {
4381 			conn->c_state = C_ERROR_CONN;
4382 		}
4383 		mutex_exit(&conn->c_lock);
4384 
4385 		rib_close_a_channel(conn);
4386 
4387 		mutex_enter(&conn->c_lock);
4388 		conn->c_flags &= ~C_CLOSE_PENDING;
4389 		cv_signal(&conn->c_cv);
4390 	}
4391 
4392 	mutex_exit(&conn->c_lock);
4393 
4394 	if (qp->mode == RIB_SERVER)
4395 		(void) rib_disconnect_channel(conn,
4396 		    &qp->hca->srv_conn_list);
4397 	else
4398 		(void) rib_disconnect_channel(conn,
4399 		    &qp->hca->cl_conn_list);
4400 }
4401 
4402 static void
4403 rib_conn_timeout_call(void *carg)
4404 {
4405 	time_t idle_time;
4406 	CONN *conn = (CONN *)carg;
4407 	rib_hca_t *hca = ctoqp(conn)->hca;
4408 	int error;
4409 
4410 	mutex_enter(&conn->c_lock);
4411 	if ((conn->c_ref > 0) ||
4412 	    (conn->c_state == C_DISCONN_PEND)) {
4413 		conn->c_timeout = NULL;
4414 		mutex_exit(&conn->c_lock);
4415 		return;
4416 	}
4417 
4418 	idle_time = (gethrestime_sec() - conn->c_last_used);
4419 
4420 	if ((idle_time <= rib_conn_timeout) &&
4421 	    (conn->c_state != C_ERROR_CONN)) {
4422 		/*
4423 		 * There was activity after the last timeout.
4424 		 * Extend the conn life. Unless the conn is
4425 		 * already in error state.
4426 		 */
4427 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4428 		    SEC_TO_TICK(rib_conn_timeout - idle_time));
4429 		mutex_exit(&conn->c_lock);
4430 		return;
4431 	}
4432 
4433 	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4434 	    (void *)conn, DDI_NOSLEEP);
4435 
4436 	/*
4437 	 * If taskq dispatch fails above, then reset the timeout
4438 	 * to try again after 10 secs.
4439 	 */
4440 
4441 	if (error != DDI_SUCCESS) {
4442 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4443 		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4444 		mutex_exit(&conn->c_lock);
4445 		return;
4446 	}
4447 
4448 	conn->c_state = C_DISCONN_PEND;
4449 	mutex_exit(&conn->c_lock);
4450 }
4451 
4452 static rdma_stat
4453 rib_conn_release(CONN *conn)
4454 {
4455 
4456 	mutex_enter(&conn->c_lock);
4457 	conn->c_ref--;
4458 
4459 	conn->c_last_used = gethrestime_sec();
4460 	if (conn->c_ref > 0) {
4461 		mutex_exit(&conn->c_lock);
4462 		return (RDMA_SUCCESS);
4463 	}
4464 
4465 	/*
4466 	 * If a conn is C_ERROR_CONN, close the channel.
4467 	 */
4468 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4469 		conn->c_state = C_DISCONN_PEND;
4470 		mutex_exit(&conn->c_lock);
4471 		rib_conn_close((void *)conn);
4472 		return (RDMA_SUCCESS);
4473 	}
4474 
4475 	/*
4476 	 * c_ref == 0, set a timeout for conn release
4477 	 */
4478 
4479 	if (conn->c_timeout == NULL) {
4480 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4481 		    SEC_TO_TICK(rib_conn_timeout));
4482 	}
4483 
4484 	mutex_exit(&conn->c_lock);
4485 	return (RDMA_SUCCESS);
4486 }
4487 
4488 /*
4489  * Add at front of list
4490  */
4491 static struct rdma_done_list *
4492 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4493 {
4494 	struct rdma_done_list *rd;
4495 
4496 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4497 
4498 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4499 	rd->xid = xid;
4500 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4501 
4502 	rd->prev = NULL;
4503 	rd->next = qp->rdlist;
4504 	if (qp->rdlist != NULL)
4505 		qp->rdlist->prev = rd;
4506 	qp->rdlist = rd;
4507 
4508 	return (rd);
4509 }
4510 
4511 static void
4512 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4513 {
4514 	struct rdma_done_list *r;
4515 
4516 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4517 
4518 	r = rd->next;
4519 	if (r != NULL) {
4520 		r->prev = rd->prev;
4521 	}
4522 
4523 	r = rd->prev;
4524 	if (r != NULL) {
4525 		r->next = rd->next;
4526 	} else {
4527 		qp->rdlist = rd->next;
4528 	}
4529 
4530 	cv_destroy(&rd->rdma_done_cv);
4531 	kmem_free(rd, sizeof (*rd));
4532 }
4533 
4534 static void
4535 rdma_done_rem_list(rib_qp_t *qp)
4536 {
4537 	struct rdma_done_list	*r, *n;
4538 
4539 	mutex_enter(&qp->rdlist_lock);
4540 	for (r = qp->rdlist; r != NULL; r = n) {
4541 		n = r->next;
4542 		rdma_done_rm(qp, r);
4543 	}
4544 	mutex_exit(&qp->rdlist_lock);
4545 }
4546 
4547 static void
4548 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4549 {
4550 	struct rdma_done_list *r = qp->rdlist;
4551 
4552 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4553 
4554 	while (r) {
4555 		if (r->xid == xid) {
4556 			cv_signal(&r->rdma_done_cv);
4557 			return;
4558 		} else {
4559 			r = r->next;
4560 		}
4561 	}
4562 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4563 	    int, xid);
4564 }
4565 
4566 /*
4567  * Expects conn->c_lock to be held by the caller.
4568  */
4569 
4570 static void
4571 rib_close_a_channel(CONN *conn)
4572 {
4573 	rib_qp_t	*qp;
4574 	qp = ctoqp(conn);
4575 
4576 	if (qp->qp_hdl == NULL) {
4577 		/* channel already freed */
4578 		return;
4579 	}
4580 
4581 	/*
4582 	 * Call ibt_close_rc_channel in blocking mode
4583 	 * with no callbacks.
4584 	 */
4585 	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4586 	    NULL, 0, NULL, NULL, 0);
4587 }
4588 
4589 /*
4590  * Goes through all connections and closes the channel
4591  * This will cause all the WRs on those channels to be
4592  * flushed.
4593  */
4594 static void
4595 rib_close_channels(rib_conn_list_t *connlist)
4596 {
4597 	CONN 		*conn, *tmp;
4598 
4599 	rw_enter(&connlist->conn_lock, RW_READER);
4600 	conn = connlist->conn_hd;
4601 	while (conn != NULL) {
4602 		mutex_enter(&conn->c_lock);
4603 		tmp = conn->c_next;
4604 		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4605 
4606 			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4607 
4608 			/*
4609 			 * Live connection in CONNECTED state.
4610 			 */
4611 			if (conn->c_state == C_CONNECTED)
4612 				conn->c_state = C_ERROR_CONN;
4613 			mutex_exit(&conn->c_lock);
4614 
4615 			rib_close_a_channel(conn);
4616 
4617 			mutex_enter(&conn->c_lock);
4618 			conn->c_flags &= ~C_CLOSE_PENDING;
4619 			/* Signal a pending rib_disconnect_channel() */
4620 			cv_signal(&conn->c_cv);
4621 		}
4622 		mutex_exit(&conn->c_lock);
4623 		conn = tmp;
4624 	}
4625 	rw_exit(&connlist->conn_lock);
4626 }
4627 
4628 /*
4629  * Frees up all connections that are no longer being referenced
4630  */
4631 static void
4632 rib_purge_connlist(rib_conn_list_t *connlist)
4633 {
4634 	CONN 		*conn;
4635 
4636 top:
4637 	rw_enter(&connlist->conn_lock, RW_READER);
4638 	conn = connlist->conn_hd;
4639 	while (conn != NULL) {
4640 		mutex_enter(&conn->c_lock);
4641 
4642 		/*
4643 		 * At this point connection is either in ERROR
4644 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4645 		 * then some other thread is culling that connection.
4646 		 * If not and if c_ref is 0, then destroy the connection.
4647 		 */
4648 		if (conn->c_ref == 0 &&
4649 		    conn->c_state != C_DISCONN_PEND) {
4650 			/*
4651 			 * Cull the connection
4652 			 */
4653 			conn->c_state = C_DISCONN_PEND;
4654 			mutex_exit(&conn->c_lock);
4655 			rw_exit(&connlist->conn_lock);
4656 			(void) rib_disconnect_channel(conn, connlist);
4657 			goto top;
4658 		} else {
4659 			/*
4660 			 * conn disconnect already scheduled or will
4661 			 * happen from conn_release when c_ref drops to 0.
4662 			 */
4663 			mutex_exit(&conn->c_lock);
4664 		}
4665 		conn = conn->c_next;
4666 	}
4667 	rw_exit(&connlist->conn_lock);
4668 
4669 	/*
4670 	 * At this point, only connections with c_ref != 0 are on the list
4671 	 */
4672 }
4673 
4674 /*
4675  * Free all the HCA resources and close
4676  * the hca.
4677  */
4678 
4679 static void
4680 rib_free_hca(rib_hca_t *hca)
4681 {
4682 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4683 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4684 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4685 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4686 
4687 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4688 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4689 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4690 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4691 
4692 	rib_rbufpool_destroy(hca, RECV_BUFFER);
4693 	rib_rbufpool_destroy(hca, SEND_BUFFER);
4694 	rib_destroy_cache(hca);
4695 	if (rib_mod.rdma_count == 0)
4696 		rdma_unregister_mod(&rib_mod);
4697 	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4698 	(void) ibt_close_hca(hca->hca_hdl);
4699 	hca->hca_hdl = NULL;
4700 }
4701 
4702 
4703 static void
4704 rib_stop_hca_services(rib_hca_t *hca)
4705 {
4706 	rib_stop_services(hca);
4707 	rib_close_channels(&hca->cl_conn_list);
4708 	rib_close_channels(&hca->srv_conn_list);
4709 
4710 	rib_purge_connlist(&hca->cl_conn_list);
4711 	rib_purge_connlist(&hca->srv_conn_list);
4712 
4713 	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4714 		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4715 		    GLOBAL_ZONEID);
4716 		stats_enabled = FALSE;
4717 	}
4718 
4719 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4720 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4721 	if (hca->srv_conn_list.conn_hd == NULL &&
4722 	    hca->cl_conn_list.conn_hd == NULL) {
4723 		/*
4724 		 * conn_lists are NULL, so destroy
4725 		 * buffers, close hca and be done.
4726 		 */
4727 		rib_free_hca(hca);
4728 	}
4729 	rw_exit(&hca->cl_conn_list.conn_lock);
4730 	rw_exit(&hca->srv_conn_list.conn_lock);
4731 
4732 	if (hca->hca_hdl != NULL) {
4733 		mutex_enter(&hca->inuse_lock);
4734 		while (hca->inuse)
4735 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4736 		mutex_exit(&hca->inuse_lock);
4737 
4738 		rib_free_hca(hca);
4739 	}
4740 	rw_destroy(&hca->bound_services_lock);
4741 
4742 	if (hca->cleanup_helper != NULL) {
4743 		ddi_taskq_destroy(hca->cleanup_helper);
4744 		hca->cleanup_helper = NULL;
4745 	}
4746 }
4747 
4748 /*
4749  * Cleans and closes up all uses of the HCA
4750  */
4751 static void
4752 rib_detach_hca(rib_hca_t *hca)
4753 {
4754 	rib_hca_t **hcap;
4755 
4756 	/*
4757 	 * Stop all services on the HCA
4758 	 * Go through cl_conn_list and close all rc_channels
4759 	 * Go through svr_conn_list and close all rc_channels
4760 	 * Free connections whose c_ref has dropped to 0
4761 	 * Destroy all CQs
4762 	 * Deregister and released all buffer pool memory after all
4763 	 * connections are destroyed
4764 	 * Free the protection domain
4765 	 * ibt_close_hca()
4766 	 */
4767 	rw_enter(&hca->state_lock, RW_WRITER);
4768 	if (hca->state == HCA_DETACHED) {
4769 		rw_exit(&hca->state_lock);
4770 		return;
4771 	}
4772 
4773 	hca->state = HCA_DETACHED;
4774 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4775 	for (hcap = &rib_stat->hcas_list; *hcap && (*hcap != hca);
4776 	    hcap = &(*hcap)->next)
4777 		;
4778 	ASSERT(*hcap == hca);
4779 	*hcap = hca->next;
4780 	rib_stat->nhca_inited--;
4781 	rib_mod.rdma_count--;
4782 	rw_exit(&rib_stat->hcas_list_lock);
4783 	rw_exit(&hca->state_lock);
4784 
4785 	rib_stop_hca_services(hca);
4786 
4787 	kmem_free(hca, sizeof (*hca));
4788 }
4789 
4790 static void
4791 rib_server_side_cache_reclaim(void *argp)
4792 {
4793 	cache_avl_struct_t    *rcas;
4794 	rib_lrc_entry_t		*rb;
4795 	rib_hca_t *hca = (rib_hca_t *)argp;
4796 
4797 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4798 	rcas = avl_first(&hca->avl_tree);
4799 	if (rcas != NULL)
4800 		avl_remove(&hca->avl_tree, rcas);
4801 
4802 	while (rcas != NULL) {
4803 		while (rcas->r.forw != &rcas->r) {
4804 			rcas->elements--;
4805 			rb = rcas->r.forw;
4806 			remque(rb);
4807 			if (rb->registered)
4808 				(void) rib_deregistermem_via_hca(hca,
4809 				    rb->lrc_buf, rb->lrc_mhandle);
4810 
4811 			hca->cache_allocation -= rb->lrc_len;
4812 			kmem_free(rb->lrc_buf, rb->lrc_len);
4813 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4814 		}
4815 		mutex_destroy(&rcas->node_lock);
4816 		kmem_cache_free(hca->server_side_cache, rcas);
4817 		rcas = avl_first(&hca->avl_tree);
4818 		if (rcas != NULL)
4819 			avl_remove(&hca->avl_tree, rcas);
4820 	}
4821 	rw_exit(&hca->avl_rw_lock);
4822 }
4823 
4824 static void
4825 rib_server_side_cache_cleanup(void *argp)
4826 {
4827 	cache_avl_struct_t    *rcas;
4828 	rib_lrc_entry_t		*rb;
4829 	rib_hca_t *hca = (rib_hca_t *)argp;
4830 
4831 	mutex_enter(&hca->cache_allocation_lock);
4832 	if (hca->cache_allocation < cache_limit) {
4833 		mutex_exit(&hca->cache_allocation_lock);
4834 		return;
4835 	}
4836 	mutex_exit(&hca->cache_allocation_lock);
4837 
4838 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4839 	rcas = avl_last(&hca->avl_tree);
4840 	if (rcas != NULL)
4841 		avl_remove(&hca->avl_tree, rcas);
4842 
4843 	while (rcas != NULL) {
4844 		while (rcas->r.forw != &rcas->r) {
4845 			rcas->elements--;
4846 			rb = rcas->r.forw;
4847 			remque(rb);
4848 			if (rb->registered)
4849 				(void) rib_deregistermem_via_hca(hca,
4850 				    rb->lrc_buf, rb->lrc_mhandle);
4851 
4852 			hca->cache_allocation -= rb->lrc_len;
4853 
4854 			kmem_free(rb->lrc_buf, rb->lrc_len);
4855 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4856 		}
4857 		mutex_destroy(&rcas->node_lock);
4858 		if (hca->server_side_cache) {
4859 			kmem_cache_free(hca->server_side_cache, rcas);
4860 		}
4861 
4862 		if (hca->cache_allocation < cache_limit) {
4863 			rw_exit(&hca->avl_rw_lock);
4864 			return;
4865 		}
4866 
4867 		rcas = avl_last(&hca->avl_tree);
4868 		if (rcas != NULL)
4869 			avl_remove(&hca->avl_tree, rcas);
4870 	}
4871 	rw_exit(&hca->avl_rw_lock);
4872 }
4873 
4874 static int
4875 avl_compare(const void *t1, const void *t2)
4876 {
4877 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4878 		return (0);
4879 
4880 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4881 		return (-1);
4882 
4883 	return (1);
4884 }
4885 
4886 static void
4887 rib_destroy_cache(rib_hca_t *hca)
4888 {
4889 	if (hca->avl_init) {
4890 		rib_server_side_cache_reclaim((void *)hca);
4891 		if (hca->server_side_cache) {
4892 			kmem_cache_destroy(hca->server_side_cache);
4893 			hca->server_side_cache = NULL;
4894 		}
4895 		avl_destroy(&hca->avl_tree);
4896 		mutex_destroy(&hca->cache_allocation_lock);
4897 		rw_destroy(&hca->avl_rw_lock);
4898 	}
4899 	hca->avl_init = FALSE;
4900 }
4901 
4902 static void
4903 rib_force_cleanup(void *hca)
4904 {
4905 	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
4906 		(void) ddi_taskq_dispatch(
4907 		    ((rib_hca_t *)hca)->cleanup_helper,
4908 		    rib_server_side_cache_cleanup,
4909 		    (void *)hca, DDI_NOSLEEP);
4910 }
4911 
4912 static rib_lrc_entry_t *
4913 rib_get_cache_buf(CONN *conn, uint32_t len)
4914 {
4915 	cache_avl_struct_t	cas, *rcas;
4916 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4917 	rib_lrc_entry_t *reply_buf;
4918 	avl_index_t where = NULL;
4919 	uint64_t c_alloc = 0;
4920 
4921 	if (!hca->avl_init)
4922 		goto  error_alloc;
4923 
4924 	cas.len = len;
4925 
4926 	rw_enter(&hca->avl_rw_lock, RW_READER);
4927 
4928 	mutex_enter(&hca->cache_allocation_lock);
4929 	c_alloc = hca->cache_allocation;
4930 	mutex_exit(&hca->cache_allocation_lock);
4931 
4932 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
4933 	    &where)) == NULL) {
4934 		/* Am I above the cache limit */
4935 		if ((c_alloc + len) >= cache_limit) {
4936 			rib_force_cleanup((void *)hca);
4937 			rw_exit(&hca->avl_rw_lock);
4938 			mutex_enter(&hca->cache_allocation_lock);
4939 			hca->cache_misses_above_the_limit ++;
4940 			mutex_exit(&hca->cache_allocation_lock);
4941 
4942 			/* Allocate and register the buffer directly */
4943 			goto error_alloc;
4944 		}
4945 
4946 		rw_exit(&hca->avl_rw_lock);
4947 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
4948 
4949 		/* Recheck to make sure no other thread added the entry in */
4950 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
4951 		    &cas, &where)) == NULL) {
4952 			/* Allocate an avl tree entry */
4953 			rcas = (cache_avl_struct_t *)
4954 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
4955 
4956 			bzero(rcas, sizeof (cache_avl_struct_t));
4957 			rcas->elements = 0;
4958 			rcas->r.forw = &rcas->r;
4959 			rcas->r.back = &rcas->r;
4960 			rcas->len = len;
4961 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
4962 			avl_insert(&hca->avl_tree, rcas, where);
4963 		}
4964 	}
4965 
4966 	mutex_enter(&rcas->node_lock);
4967 
4968 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
4969 		reply_buf = rcas->r.forw;
4970 		remque(reply_buf);
4971 		rcas->elements--;
4972 		mutex_exit(&rcas->node_lock);
4973 		rw_exit(&hca->avl_rw_lock);
4974 
4975 		mutex_enter(&hca->cache_allocation_lock);
4976 		hca->cache_hits++;
4977 		hca->cache_allocation -= len;
4978 		mutex_exit(&hca->cache_allocation_lock);
4979 	} else {
4980 		/* Am I above the cache limit */
4981 		mutex_exit(&rcas->node_lock);
4982 		if ((c_alloc + len) >= cache_limit) {
4983 			rib_force_cleanup((void *)hca);
4984 			rw_exit(&hca->avl_rw_lock);
4985 
4986 			mutex_enter(&hca->cache_allocation_lock);
4987 			hca->cache_misses_above_the_limit++;
4988 			mutex_exit(&hca->cache_allocation_lock);
4989 			/* Allocate and register the buffer directly */
4990 			goto error_alloc;
4991 		}
4992 		rw_exit(&hca->avl_rw_lock);
4993 		mutex_enter(&hca->cache_allocation_lock);
4994 		hca->cache_misses++;
4995 		mutex_exit(&hca->cache_allocation_lock);
4996 		/* Allocate a reply_buf entry */
4997 		reply_buf = (rib_lrc_entry_t *)
4998 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4999 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
5000 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
5001 		reply_buf->lrc_len  = len;
5002 		reply_buf->registered = FALSE;
5003 		reply_buf->avl_node = (void *)rcas;
5004 	}
5005 
5006 	return (reply_buf);
5007 
5008 error_alloc:
5009 	reply_buf = (rib_lrc_entry_t *)
5010 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5011 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
5012 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5013 	reply_buf->lrc_len = len;
5014 	reply_buf->registered = FALSE;
5015 	reply_buf->avl_node = NULL;
5016 
5017 	return (reply_buf);
5018 }
5019 
5020 /*
5021  * Return a pre-registered back to the cache (without
5022  * unregistering the buffer)..
5023  */
5024 
5025 static void
5026 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5027 {
5028 	cache_avl_struct_t    cas, *rcas;
5029 	avl_index_t where = NULL;
5030 	rib_hca_t	*hca = (ctoqp(conn))->hca;
5031 
5032 	if (!hca->avl_init)
5033 		goto  error_free;
5034 
5035 	cas.len = reg_buf->lrc_len;
5036 	rw_enter(&hca->avl_rw_lock, RW_READER);
5037 	if ((rcas = (cache_avl_struct_t *)
5038 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5039 		rw_exit(&hca->avl_rw_lock);
5040 		goto error_free;
5041 	} else {
5042 		cas.len = reg_buf->lrc_len;
5043 		mutex_enter(&rcas->node_lock);
5044 		insque(reg_buf, &rcas->r);
5045 		rcas->elements ++;
5046 		mutex_exit(&rcas->node_lock);
5047 		rw_exit(&hca->avl_rw_lock);
5048 		mutex_enter(&hca->cache_allocation_lock);
5049 		hca->cache_allocation += cas.len;
5050 		mutex_exit(&hca->cache_allocation_lock);
5051 	}
5052 
5053 	return;
5054 
5055 error_free:
5056 
5057 	if (reg_buf->registered)
5058 		(void) rib_deregistermem_via_hca(hca,
5059 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5060 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5061 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5062 }
5063 
5064 static rdma_stat
5065 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5066 	uint_t buflen, struct mrc *buf_handle)
5067 {
5068 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
5069 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
5070 	rdma_stat	status;
5071 
5072 
5073 	/*
5074 	 * Note: ALL buffer pools use the same memory type RDMARW.
5075 	 */
5076 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5077 	if (status == RDMA_SUCCESS) {
5078 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5079 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5080 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5081 	} else {
5082 		buf_handle->mrc_linfo = NULL;
5083 		buf_handle->mrc_lmr = 0;
5084 		buf_handle->mrc_rmr = 0;
5085 	}
5086 	return (status);
5087 }
5088 
5089 /* ARGSUSED */
5090 static rdma_stat
5091 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5092     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5093 {
5094 
5095 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5096 	return (RDMA_SUCCESS);
5097 }
5098 
5099 /* ARGSUSED */
5100 static rdma_stat
5101 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5102 {
5103 
5104 	(void) ibt_deregister_mr(hca->hca_hdl,
5105 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5106 	return (RDMA_SUCCESS);
5107 }
5108 
5109 /*
5110  * Check if the IP interface named by `lifrp' is RDMA-capable.
5111  */
5112 static boolean_t
5113 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5114 {
5115 	char ifname[LIFNAMSIZ];
5116 	char *cp;
5117 
5118 	if (lifrp->lifr_type == IFT_IB)
5119 		return (B_TRUE);
5120 
5121 	/*
5122 	 * Strip off the logical interface portion before getting
5123 	 * intimate with the name.
5124 	 */
5125 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5126 	if ((cp = strchr(ifname, ':')) != NULL)
5127 		*cp = '\0';
5128 
5129 	return (strcmp("lo0", ifname) == 0);
5130 }
5131 
5132 static int
5133 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5134 {
5135 	vnode_t *kvp, *vp;
5136 	TIUSER  *tiptr;
5137 	struct  strioctl iocb;
5138 	k_sigset_t smask;
5139 	int	err = 0;
5140 
5141 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
5142 		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
5143 		    &tiptr, CRED()) == 0) {
5144 			vp = tiptr->fp->f_vnode;
5145 		} else {
5146 			VN_RELE(kvp);
5147 			return (EPROTO);
5148 		}
5149 	} else {
5150 		return (EPROTO);
5151 	}
5152 
5153 	iocb.ic_cmd = cmd;
5154 	iocb.ic_timout = 0;
5155 	iocb.ic_len = len;
5156 	iocb.ic_dp = (caddr_t)arg;
5157 	sigintr(&smask, 0);
5158 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5159 	sigunintr(&smask);
5160 	(void) t_kclose(tiptr, 0);
5161 	VN_RELE(kvp);
5162 	return (err);
5163 }
5164 
5165 /*
5166  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5167  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5168  */
5169 static int
5170 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5171 {
5172 	int err;
5173 	struct lifnum lifn;
5174 
5175 	bzero(&lifn, sizeof (struct lifnum));
5176 	lifn.lifn_family = AF_UNSPEC;
5177 
5178 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5179 	if (err != 0)
5180 		return (err);
5181 
5182 	/*
5183 	 * Pad the interface count to account for additional interfaces that
5184 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5185 	 */
5186 	lifn.lifn_count += 4;
5187 
5188 	bzero(lifcp, sizeof (struct lifconf));
5189 	lifcp->lifc_family = AF_UNSPEC;
5190 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5191 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5192 
5193 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5194 	if (err != 0) {
5195 		kmem_free(lifcp->lifc_buf, *bufsizep);
5196 		return (err);
5197 	}
5198 	return (0);
5199 }
5200 
5201 static boolean_t
5202 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5203 {
5204 	uint_t i, nifs;
5205 	uint_t bufsize;
5206 	struct lifconf lifc;
5207 	struct lifreq *lifrp;
5208 	struct sockaddr_in *sinp;
5209 	struct sockaddr_in6 *sin6p;
5210 
5211 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5212 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5213 
5214 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5215 		return (B_FALSE);
5216 
5217 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5218 		kmem_free(lifc.lifc_buf, bufsize);
5219 		return (B_FALSE);
5220 	}
5221 
5222 	/*
5223 	 * Worst case is that all of the addresses are IB-capable and have
5224 	 * the same address family, so size our buffers accordingly.
5225 	 */
5226 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5227 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5228 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5229 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5230 
5231 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5232 		if (!rpcib_rdma_capable_interface(lifrp))
5233 			continue;
5234 
5235 		if (lifrp->lifr_addr.ss_family == AF_INET) {
5236 			sinp = addrs4->ri_list;
5237 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5238 			    sizeof (struct sockaddr_in));
5239 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5240 			sin6p = addrs6->ri_list;
5241 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5242 			    sizeof (struct sockaddr_in6));
5243 		}
5244 	}
5245 
5246 	kmem_free(lifc.lifc_buf, bufsize);
5247 	return (B_TRUE);
5248 }
5249 
5250 /* ARGSUSED */
5251 static int
5252 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5253 {
5254 	rib_hca_t *hca;
5255 
5256 	if (KSTAT_WRITE == rw) {
5257 		return (EACCES);
5258 	}
5259 
5260 	rpcib_kstat.cache_limit.value.ui64 =
5261 	    (uint64_t)cache_limit;
5262 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5263 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5264 		rpcib_kstat.cache_allocation.value.ui64 +=
5265 		    (uint64_t)hca->cache_allocation;
5266 		rpcib_kstat.cache_hits.value.ui64 +=
5267 		    (uint64_t)hca->cache_hits;
5268 		rpcib_kstat.cache_misses.value.ui64 +=
5269 		    (uint64_t)hca->cache_misses;
5270 		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5271 		    (uint64_t)hca->cache_misses_above_the_limit;
5272 	}
5273 	rw_exit(&rib_stat->hcas_list_lock);
5274 	return (0);
5275 }
5276