xref: /illumos-gate/usr/src/uts/common/rpc/rpcib.c (revision 614f161203d313b00e559d24c1d439b11e022fd5)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developors:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * The rpcib plugin. Implements the interface for RDMATF's
42  * interaction with IBTF.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84 
85 #define	NFS_RDMA_PORT	20049
86 
87 
88 /*
89  * Convenience structures for connection management
90  */
/*
 * A counted list of addresses: ri_list points at the storage, ri_count is
 * the number of entries in it and ri_size is the size of that storage in
 * bytes. rpcib_get_ib_addresses() takes two of these as arguments
 * (presumably one per address family -- TODO confirm against its
 * definition).
 */
typedef struct rpcib_ipaddrs {
	void	*ri_list;	/* pointer to list of addresses */
	uint_t	ri_count;	/* number of addresses in list */
	uint_t	ri_size;	/* size of ri_list in bytes */
} rpcib_ipaddrs_t;
96 
97 
/*
 * Bundle passed to rib_ping_srv() and rib_conn_to_srv(): an HCA together
 * with an IB path and a source/destination IP address pair.
 * NOTE(review): presumably filled in by rib_ping_srv() and consumed when
 * the connection is established -- confirm against those functions.
 */
typedef struct rpcib_ping {
	rib_hca_t  *hca;	/* HCA associated with this path */
	ibt_path_info_t path;	/* IB path information */
	ibt_ip_addr_t srcip;	/* source IP address */
	ibt_ip_addr_t dstip;	/* destination IP address */
} rpcib_ping_t;
104 
105 /*
106  * Prototype declarations for driver ops
107  */
108 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
109 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
110 				void *, void **);
111 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
112 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
113 static int	rpcib_do_ip_ioctl(int, int, void *);
114 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
115 static int rpcib_cache_kstat_update(kstat_t *, int);
116 static void rib_force_cleanup(void *);
117 static void rib_stop_hca_services(rib_hca_t *);
118 static void rib_attach_hca(void);
119 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
120 		struct netbuf *d_svcaddr, CONN **conn);
121 
/*
 * Named kstat counters for the rpcib cache. rpcib_open_hcas() installs
 * this object as the ks_data of the "rpcib_cache" kstat, with
 * rpcib_cache_kstat_update() as its update routine. Every member must be a
 * kstat_named_t because the entry count handed to kstat_create_zone() is
 * computed as sizeof (rpcib_kstat) / sizeof (kstat_named_t).
 */
struct {
	kstat_named_t cache_limit;
	kstat_named_t cache_allocation;
	kstat_named_t cache_hits;
	kstat_named_t cache_misses;
	kstat_named_t cache_misses_above_the_limit;
} rpcib_kstat = {
	/* { exported name, data type } -- all counters are 64-bit unsigned */
	{"cache_limit",			KSTAT_DATA_UINT64 },
	{"cache_allocation",		KSTAT_DATA_UINT64 },
	{"cache_hits",			KSTAT_DATA_UINT64 },
	{"cache_misses",		KSTAT_DATA_UINT64 },
	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
};
135 
136 /* rpcib cb_ops */
137 static struct cb_ops rpcib_cbops = {
138 	nulldev,		/* open */
139 	nulldev,		/* close */
140 	nodev,			/* strategy */
141 	nodev,			/* print */
142 	nodev,			/* dump */
143 	nodev,			/* read */
144 	nodev,			/* write */
145 	nodev,			/* ioctl */
146 	nodev,			/* devmap */
147 	nodev,			/* mmap */
148 	nodev,			/* segmap */
149 	nochpoll,		/* poll */
150 	ddi_prop_op,		/* prop_op */
151 	NULL,			/* stream */
152 	D_MP,			/* cb_flag */
153 	CB_REV,			/* rev */
154 	nodev,			/* int (*cb_aread)() */
155 	nodev			/* int (*cb_awrite)() */
156 };
157 
158 /*
159  * Device options
160  */
161 static struct dev_ops rpcib_ops = {
162 	DEVO_REV,		/* devo_rev, */
163 	0,			/* refcnt  */
164 	rpcib_getinfo,		/* info */
165 	nulldev,		/* identify */
166 	nulldev,		/* probe */
167 	rpcib_attach,		/* attach */
168 	rpcib_detach,		/* detach */
169 	nodev,			/* reset */
170 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
171 	NULL,			/* bus operations */
172 	NULL,			/* power */
173 	ddi_quiesce_not_needed,		/* quiesce */
174 };
175 
176 /*
177  * Module linkage information.
178  */
179 
180 static struct modldrv rib_modldrv = {
181 	&mod_driverops,		/* Driver module */
182 	"RPCIB plugin driver",	/* Driver name and version */
183 	&rpcib_ops,		/* Driver ops */
184 };
185 
186 static struct modlinkage rib_modlinkage = {
187 	MODREV_1,
188 	(void *)&rib_modldrv,
189 	NULL
190 };
191 
/*
 * One cached buffer entry, handed out by rib_get_cache_buf() and returned
 * through rib_free_cache_buf().
 * NOTE(review): field meanings below are inferred from names and types
 * only; confirm against the cache routines before relying on them.
 */
typedef struct rib_lrc_entry {
	struct rib_lrc_entry *forw;	/* next entry in a doubly linked list */
	struct rib_lrc_entry *back;	/* previous entry in that list */
	char *lrc_buf;			/* presumably the buffer itself */

	uint32_t lrc_len;		/* presumably length of lrc_buf */
	void  *avl_node;		/* presumably the owning AVL node */
	bool_t registered;		/* presumably: buffer is registered */

	struct mrc lrc_mhandle;		/* memory registration handle */
	bool_t lrc_on_freed_list;	/* presumably: entry is on a free list */
} rib_lrc_entry_t;
204 
/*
 * Node type of the per-HCA AVL tree (hca->avl_tree). rpcib_open_hcas()
 * creates that tree with avl_compare() as comparator and the offset of
 * avl_link as the link offset, and sizes the per-HCA kmem cache
 * (hca->server_side_cache) to hold objects of this type.
 * NOTE(review): len/elements look like the buffer size served by this node
 * and the number of entries queued on it -- confirm in the cache code.
 */
typedef	struct cache_struct	{
	rib_lrc_entry_t		r;		/* embedded entry */
	uint32_t		len;
	uint32_t		elements;
	kmutex_t		node_lock;	/* per-node lock */
	avl_node_t		avl_link;	/* linkage into hca->avl_tree */
} cache_avl_struct_t;
212 
213 uint64_t	cache_limit = 100 * 1024 * 1024;
214 static uint64_t	cache_watermark = 80 * 1024 * 1024;
215 static bool_t	stats_enabled = FALSE;
216 
217 static uint64_t max_unsignaled_rws = 5;
218 int nfs_rdma_port = NFS_RDMA_PORT;
219 
220 /*
221  * rib_stat: private data pointer used when registering
222  *	with the IBTF.  It is returned to the consumer
223  *	in all callbacks.
224  */
225 static rpcib_state_t *rib_stat = NULL;
226 
227 #define	RNR_RETRIES	IBT_RNR_RETRY_1
228 #define	MAX_PORTS	2
229 #define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
230 #define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */
231 
232 int preposted_rbufs = RDMA_BUFS_GRANT;
233 int send_threshold = 1;
234 
235 /*
236  * Old cards with Tavor driver have limited memory footprint
237  * when booted in 32bit. The rib_max_rbufs tunable can be
238  * tuned for more buffers if needed.
239  */
240 
241 #if !defined(_ELF64) && !defined(__sparc)
242 int rib_max_rbufs = MAX_BUFS;
243 #else
244 int rib_max_rbufs = 10 * MAX_BUFS;
245 #endif	/* !(_ELF64) && !(__sparc) */
246 
247 int rib_conn_timeout = 60 * 12;		/* 12 minutes */
248 
249 /*
250  * State of the plugin.
251  * ACCEPT = accepting new connections and requests.
252  * NO_ACCEPT = not accepting new connection and requests.
253  * This should eventually move to rpcib_state_t structure, since this
254  * will tell in which state the plugin is for a particular type of service
255  * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
256  * state for one and in no_accept state for the other.
257  */
258 int		plugin_state;
259 kmutex_t	plugin_state_lock;
260 
261 ldi_ident_t rpcib_li;
262 
263 /*
264  * RPCIB RDMATF operations
265  */
266 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
267 static rdma_stat rib_disconnect(CONN *conn);
268 static void rib_listen(struct rdma_svc_data *rd);
269 static void rib_listen_stop(struct rdma_svc_data *rd);
270 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
271 	uint_t buflen, struct mrc *buf_handle);
272 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
273 	struct mrc buf_handle);
274 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
275 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
276 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
277 		struct mrc buf_handle);
278 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
279 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
280 	void *lrc);
281 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
282 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
283 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
284 	caddr_t buf, int len, int cpu);
285 
286 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
287 
288 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
289 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
290 
291 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
292 
293 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
294 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
295 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
296 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
297 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
298 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
299 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
300 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
301 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
302 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
303 	int addr_type, void *, CONN **);
304 static rdma_stat rib_conn_release(CONN *conn);
305 static rdma_stat rib_getinfo(rdma_info_t *info);
306 
307 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
308 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
309 static void rib_destroy_cache(rib_hca_t *hca);
310 static	void	rib_server_side_cache_reclaim(void *argp);
311 static int avl_compare(const void *t1, const void *t2);
312 
313 static void rib_stop_services(rib_hca_t *);
314 static void rib_close_channels(rib_conn_list_t *);
315 static void rib_conn_close(void *);
316 
317 /*
318  * RPCIB addressing operations
319  */
320 
321 /*
322  * RDMA operations the RPCIB module exports
323  */
/*
 * Operations vector handed to RDMATF through rib_mod. The initializer is
 * positional, so the order of the entries below must match the member
 * order of rdmaops_t (declared outside this file); do not reorder.
 */
static rdmaops_t rib_ops = {
	rib_reachable,
	rib_conn_get,
	rib_conn_release,
	rib_listen,
	rib_listen_stop,
	rib_registermem,
	rib_deregistermem,
	rib_registermemsync,
	rib_deregistermemsync,
	rib_syncmem,
	rib_reg_buf_alloc,
	rib_reg_buf_free,
	rib_send,
	rib_send_resp,
	rib_post_resp,
	rib_post_resp_remove,
	rib_post_recv,
	rib_recv,
	rib_read,
	rib_write,
	rib_getinfo,
};
347 
348 /*
349  * RDMATF RPCIB plugin details
350  */
/*
 * Module descriptor passed to rdma_register_mod() in rpcib_attach().
 * Its rdma_count member is kept in step with the number of initialized
 * HCAs by rpcib_open_hcas() and rpcib_free_hca_list().
 */
static rdma_mod_t rib_mod = {
	"ibtf",		/* api name */
	RDMATF_VERS_1,
	0,		/* presumably rdma_count; positional -- TODO confirm */
	&rib_ops,	/* rdma op vector for ibtf */
};
357 
358 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
359 static rdma_stat rib_qp_init(rib_qp_t *, int);
360 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
361 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
362 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
363 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
364 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
365 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
366 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
367 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
368 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
369 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
370 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
371 	rib_qp_t **);
372 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
373 	rib_qp_t **);
374 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
375 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
376 static int rib_free_sendwait(struct send_wid *);
377 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
378 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
379 static void rdma_done_rem_list(rib_qp_t *);
380 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
381 
382 static void rib_async_handler(void *,
383 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
384 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
385 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
386 static int rib_free_svc_recv(struct svc_recv *);
387 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
388 static void rib_free_wid(struct recv_wid *);
389 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
390 static void rib_detach_hca(rib_hca_t *);
391 static void rib_close_a_channel(CONN *);
392 static void rib_send_hold(rib_qp_t *);
393 static void rib_send_rele(rib_qp_t *);
394 
395 /*
396  * Registration with IBTF as a consumer
397  */
398 static struct ibt_clnt_modinfo_s rib_modinfo = {
399 	IBTI_V_CURR,
400 	IBT_GENERIC,
401 	rib_async_handler,	/* async event handler */
402 	NULL,			/* Memory Region Handler */
403 	"nfs/ib"
404 };
405 
406 /*
407  * Global structure
408  */
409 
/*
 * Driver-wide soft state. There is a single instance (rpcib below):
 * rpcib_attach() refuses to attach when rpcib_dip is already set.
 */
typedef struct rpcib_s {
	dev_info_t	*rpcib_dip;	/* attached devinfo node, or NULL */
	kmutex_t	rpcib_mutex;	/* taken around rpcib_dip updates */
} rpcib_t;

rpcib_t rpcib;
416 
417 /*
418  * /etc/system controlled variable to control
419  * debugging in rpcib kernel module.
420  * Set it to values greater than 1 to control
421  * the amount of debugging messages required.
422  */
423 int rib_debug = 0;
424 
425 int
426 _init(void)
427 {
428 	int error;
429 
430 	error = mod_install((struct modlinkage *)&rib_modlinkage);
431 	if (error != 0) {
432 		/*
433 		 * Could not load module
434 		 */
435 		return (error);
436 	}
437 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
438 	return (0);
439 }
440 
441 int
442 _fini()
443 {
444 	int status;
445 
446 	/*
447 	 * Remove module
448 	 */
449 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
450 		return (status);
451 	}
452 	mutex_destroy(&plugin_state_lock);
453 	return (0);
454 }
455 
456 int
457 _info(struct modinfo *modinfop)
458 {
459 	return (mod_info(&rib_modlinkage, modinfop));
460 }
461 
462 /*
463  * rpcib_getinfo()
464  * Given the device number, return the devinfo pointer or the
465  * instance number.
466  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
467  */
468 
469 /*ARGSUSED*/
470 static int
471 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
472 {
473 	int ret = DDI_SUCCESS;
474 
475 	switch (cmd) {
476 	case DDI_INFO_DEVT2DEVINFO:
477 		if (rpcib.rpcib_dip != NULL)
478 			*result = rpcib.rpcib_dip;
479 		else {
480 			*result = NULL;
481 			ret = DDI_FAILURE;
482 		}
483 		break;
484 
485 	case DDI_INFO_DEVT2INSTANCE:
486 		*result = NULL;
487 		break;
488 
489 	default:
490 		ret = DDI_FAILURE;
491 	}
492 	return (ret);
493 }
494 
495 static void
496 rpcib_free_hca_list()
497 {
498 	rib_hca_t *hca, *hcap;
499 
500 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
501 	hca = rib_stat->hcas_list;
502 	rib_stat->hcas_list = NULL;
503 	rw_exit(&rib_stat->hcas_list_lock);
504 	while (hca != NULL) {
505 		rw_enter(&hca->state_lock, RW_WRITER);
506 		hcap = hca;
507 		hca = hca->next;
508 		rib_stat->nhca_inited--;
509 		rib_mod.rdma_count--;
510 		hcap->state = HCA_DETACHED;
511 		rw_exit(&hcap->state_lock);
512 		rib_stop_hca_services(hcap);
513 
514 		kmem_free(hcap, sizeof (*hcap));
515 	}
516 }
517 
518 static rdma_stat
519 rpcib_free_service_list()
520 {
521 	rib_service_t *service;
522 	ibt_status_t ret;
523 
524 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
525 	while (rib_stat->service_list != NULL) {
526 		service = rib_stat->service_list;
527 		ret = ibt_unbind_all_services(service->srv_hdl);
528 		if (ret != IBT_SUCCESS) {
529 			rw_exit(&rib_stat->service_list_lock);
530 #ifdef DEBUG
531 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
532 			    "ibt_unbind_all_services failed (%d)\n", (int)ret);
533 #endif
534 			return (RDMA_FAILED);
535 		}
536 		ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
537 		    service->srv_hdl);
538 		if (ret != IBT_SUCCESS) {
539 			rw_exit(&rib_stat->service_list_lock);
540 #ifdef DEBUG
541 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
542 			    "ibt_deregister_service failed (%d)\n", (int)ret);
543 #endif
544 			return (RDMA_FAILED);
545 		}
546 		rib_stat->service_list = service->next;
547 		kmem_free(service, sizeof (rib_service_t));
548 	}
549 	rw_exit(&rib_stat->service_list_lock);
550 
551 	return (RDMA_SUCCESS);
552 }
553 
554 static int
555 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
556 {
557 	ibt_status_t	ibt_status;
558 	rdma_stat	r_status;
559 
560 	switch (cmd) {
561 	case DDI_ATTACH:
562 		break;
563 	case DDI_RESUME:
564 		return (DDI_SUCCESS);
565 	default:
566 		return (DDI_FAILURE);
567 	}
568 
569 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
570 
571 	mutex_enter(&rpcib.rpcib_mutex);
572 	if (rpcib.rpcib_dip != NULL) {
573 		mutex_exit(&rpcib.rpcib_mutex);
574 		return (DDI_FAILURE);
575 	}
576 	rpcib.rpcib_dip = dip;
577 	mutex_exit(&rpcib.rpcib_mutex);
578 	/*
579 	 * Create the "rpcib" minor-node.
580 	 */
581 	if (ddi_create_minor_node(dip,
582 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
583 		/* Error message, no cmn_err as they print on console */
584 		return (DDI_FAILURE);
585 	}
586 
587 	if (rib_stat == NULL) {
588 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
589 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
590 		rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
591 		mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
592 	}
593 
594 	rib_stat->hca_count = ibt_get_hca_list(NULL);
595 	if (rib_stat->hca_count < 1) {
596 		mutex_destroy(&rib_stat->listen_lock);
597 		rw_destroy(&rib_stat->hcas_list_lock);
598 		mutex_destroy(&rib_stat->open_hca_lock);
599 		kmem_free(rib_stat, sizeof (*rib_stat));
600 		rib_stat = NULL;
601 		return (DDI_FAILURE);
602 	}
603 
604 	ibt_status = ibt_attach(&rib_modinfo, dip,
605 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
606 
607 	if (ibt_status != IBT_SUCCESS) {
608 		mutex_destroy(&rib_stat->listen_lock);
609 		rw_destroy(&rib_stat->hcas_list_lock);
610 		mutex_destroy(&rib_stat->open_hca_lock);
611 		kmem_free(rib_stat, sizeof (*rib_stat));
612 		rib_stat = NULL;
613 		return (DDI_FAILURE);
614 	}
615 
616 	rib_stat->service_list = NULL;
617 	rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
618 	mutex_enter(&rib_stat->open_hca_lock);
619 	if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
620 		mutex_exit(&rib_stat->open_hca_lock);
621 		goto open_fail;
622 	}
623 	mutex_exit(&rib_stat->open_hca_lock);
624 
625 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
626 	    DDI_PROP_SUCCESS) {
627 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
628 		    "failed.");
629 		goto register_fail;
630 	}
631 
632 	/*
633 	 * Register with rdmatf
634 	 */
635 	r_status = rdma_register_mod(&rib_mod);
636 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
637 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
638 		    "status = %d", r_status);
639 		goto register_fail;
640 	}
641 
642 	return (DDI_SUCCESS);
643 
644 register_fail:
645 
646 open_fail:
647 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
648 	rpcib_free_hca_list();
649 	(void) rpcib_free_service_list();
650 	mutex_destroy(&rib_stat->listen_lock);
651 	rw_destroy(&rib_stat->hcas_list_lock);
652 	mutex_destroy(&rib_stat->open_hca_lock);
653 	rw_destroy(&rib_stat->service_list_lock);
654 	kmem_free(rib_stat, sizeof (*rib_stat));
655 	rib_stat = NULL;
656 	return (DDI_FAILURE);
657 }
658 
659 /*ARGSUSED*/
660 static int
661 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
662 {
663 	switch (cmd) {
664 
665 	case DDI_DETACH:
666 		break;
667 
668 	case DDI_SUSPEND:
669 	default:
670 		return (DDI_FAILURE);
671 	}
672 
673 	/*
674 	 * Detach the hca and free resources
675 	 */
676 	mutex_enter(&plugin_state_lock);
677 	plugin_state = NO_ACCEPT;
678 	mutex_exit(&plugin_state_lock);
679 
680 	if (rpcib_free_service_list() != RDMA_SUCCESS)
681 		return (DDI_FAILURE);
682 	rpcib_free_hca_list();
683 
684 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
685 	mutex_destroy(&rib_stat->listen_lock);
686 	rw_destroy(&rib_stat->hcas_list_lock);
687 	mutex_destroy(&rib_stat->open_hca_lock);
688 	rw_destroy(&rib_stat->service_list_lock);
689 
690 	kmem_free(rib_stat, sizeof (*rib_stat));
691 	rib_stat = NULL;
692 
693 	mutex_enter(&rpcib.rpcib_mutex);
694 	rpcib.rpcib_dip = NULL;
695 	mutex_exit(&rpcib.rpcib_mutex);
696 	mutex_destroy(&rpcib.rpcib_mutex);
697 	return (DDI_SUCCESS);
698 }
699 
700 
701 static void rib_rbufpool_free(rib_hca_t *, int);
702 static void rib_rbufpool_deregister(rib_hca_t *, int);
703 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
704 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
705 static rdma_stat rib_rem_replylist(rib_qp_t *);
706 static int rib_remreply(rib_qp_t *, struct reply *);
707 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
708 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
709 
710 
711 /*
712  * One CQ pair per HCA
713  */
714 static rdma_stat
715 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
716 	rib_cq_t **cqp)
717 {
718 	rib_cq_t	*cq;
719 	ibt_cq_attr_t	cq_attr;
720 	uint32_t	real_size;
721 	ibt_status_t	status;
722 	rdma_stat	error = RDMA_SUCCESS;
723 
724 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
725 	cq->rib_hca = hca;
726 	cq_attr.cq_size = cq_size;
727 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
728 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
729 	    &real_size);
730 	if (status != IBT_SUCCESS) {
731 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
732 		    " status=%d", status);
733 		error = RDMA_FAILED;
734 		goto fail;
735 	}
736 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
737 
738 	/*
739 	 * Enable CQ callbacks. CQ Callbacks are single shot
740 	 * (e.g. you have to call ibt_enable_cq_notify()
741 	 * after each callback to get another one).
742 	 */
743 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
744 	if (status != IBT_SUCCESS) {
745 		cmn_err(CE_WARN, "rib_create_cq: "
746 		    "enable_cq_notify failed, status %d", status);
747 		error = RDMA_FAILED;
748 		goto fail;
749 	}
750 	*cqp = cq;
751 
752 	return (error);
753 fail:
754 	if (cq->rib_cq_hdl)
755 		(void) ibt_free_cq(cq->rib_cq_hdl);
756 	if (cq)
757 		kmem_free(cq, sizeof (rib_cq_t));
758 	return (error);
759 }
760 
761 /*
762  * rpcib_find_hca
763  *
764  * Caller should have already locked the hcas_lock before calling
765  * this function.
766  */
767 static rib_hca_t *
768 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
769 {
770 	rib_hca_t *hca = ribstat->hcas_list;
771 
772 	while (hca && hca->hca_guid != guid)
773 		hca = hca->next;
774 
775 	return (hca);
776 }
777 
778 static rdma_stat
779 rpcib_open_hcas(rpcib_state_t *ribstat)
780 {
781 	rib_hca_t		*hca;
782 	ibt_status_t		ibt_status;
783 	rdma_stat		status;
784 	ibt_hca_portinfo_t	*pinfop;
785 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
786 	uint_t			size, cq_size;
787 	int			i;
788 	kstat_t *ksp;
789 	cache_avl_struct_t example_avl_node;
790 	char rssc_name[32];
791 	int old_nhca_inited = ribstat->nhca_inited;
792 	ib_guid_t		*hca_guids;
793 
794 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
795 
796 	ribstat->hca_count = ibt_get_hca_list(&hca_guids);
797 	if (ribstat->hca_count == 0)
798 		return (RDMA_FAILED);
799 
800 	rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
801 	/*
802 	 * Open a hca and setup for RDMA
803 	 */
804 	for (i = 0; i < ribstat->hca_count; i++) {
805 		if (rpcib_find_hca(ribstat, hca_guids[i]))
806 			continue;
807 		hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
808 
809 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
810 		    hca_guids[i], &hca->hca_hdl);
811 		if (ibt_status != IBT_SUCCESS) {
812 			kmem_free(hca, sizeof (rib_hca_t));
813 			continue;
814 		}
815 		hca->hca_guid = hca_guids[i];
816 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
817 		hca->state = HCA_INITED;
818 
819 		/*
820 		 * query HCA info
821 		 */
822 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
823 		if (ibt_status != IBT_SUCCESS) {
824 			goto fail1;
825 		}
826 
827 		/*
828 		 * One PD (Protection Domain) per HCA.
829 		 * A qp is allowed to access a memory region
830 		 * only when it's in the same PD as that of
831 		 * the memory region.
832 		 */
833 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
834 		if (ibt_status != IBT_SUCCESS) {
835 			goto fail1;
836 		}
837 
838 		/*
839 		 * query HCA ports
840 		 */
841 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
842 		    0, &pinfop, &hca->hca_nports, &size);
843 		if (ibt_status != IBT_SUCCESS) {
844 			goto fail2;
845 		}
846 		hca->hca_ports = pinfop;
847 		hca->hca_pinfosz = size;
848 		pinfop = NULL;
849 
850 		cq_size = DEF_CQ_SIZE; /* default cq size */
851 		/*
852 		 * Create 2 pairs of cq's (1 pair for client
853 		 * and the other pair for server) on this hca.
854 		 * If number of qp's gets too large, then several
855 		 * cq's will be needed.
856 		 */
857 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
858 		    &hca->svc_rcq);
859 		if (status != RDMA_SUCCESS) {
860 			goto fail3;
861 		}
862 
863 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
864 		    &hca->svc_scq);
865 		if (status != RDMA_SUCCESS) {
866 			goto fail3;
867 		}
868 
869 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
870 		    &hca->clnt_rcq);
871 		if (status != RDMA_SUCCESS) {
872 			goto fail3;
873 		}
874 
875 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
876 		    &hca->clnt_scq);
877 		if (status != RDMA_SUCCESS) {
878 			goto fail3;
879 		}
880 
881 		/*
882 		 * Create buffer pools.
883 		 * Note rib_rbuf_create also allocates memory windows.
884 		 */
885 		hca->recv_pool = rib_rbufpool_create(hca,
886 		    RECV_BUFFER, rib_max_rbufs);
887 		if (hca->recv_pool == NULL) {
888 			goto fail3;
889 		}
890 
891 		hca->send_pool = rib_rbufpool_create(hca,
892 		    SEND_BUFFER, rib_max_rbufs);
893 		if (hca->send_pool == NULL) {
894 			rib_rbufpool_destroy(hca, RECV_BUFFER);
895 			goto fail3;
896 		}
897 
898 		if (hca->server_side_cache == NULL) {
899 			(void) sprintf(rssc_name,
900 			    "rib_srvr_cache_%llx",
901 			    (long long unsigned int) hca->hca_guid);
902 			hca->server_side_cache = kmem_cache_create(
903 			    rssc_name,
904 			    sizeof (cache_avl_struct_t), 0,
905 			    NULL,
906 			    NULL,
907 			    rib_server_side_cache_reclaim,
908 			    hca, NULL, 0);
909 		}
910 
911 		avl_create(&hca->avl_tree,
912 		    avl_compare,
913 		    sizeof (cache_avl_struct_t),
914 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
915 		    (uint_t)(uintptr_t)&example_avl_node);
916 
917 		rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
918 		    hca->iblock);
919 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
920 		rw_init(&hca->avl_rw_lock,
921 		    NULL, RW_DRIVER, hca->iblock);
922 		mutex_init(&hca->cache_allocation_lock,
923 		    NULL, MUTEX_DRIVER, NULL);
924 		hca->avl_init = TRUE;
925 
926 		/* Create kstats for the cache */
927 		ASSERT(INGLOBALZONE(curproc));
928 
929 		if (!stats_enabled) {
930 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
931 			    KSTAT_TYPE_NAMED,
932 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
933 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
934 			    GLOBAL_ZONEID);
935 			if (ksp) {
936 				ksp->ks_data = (void *) &rpcib_kstat;
937 				ksp->ks_update = rpcib_cache_kstat_update;
938 				kstat_install(ksp);
939 				stats_enabled = TRUE;
940 			}
941 		}
942 		if (hca->cleanup_helper == NULL) {
943 			char tq_name[sizeof (hca->hca_guid) * 2 + 1];
944 
945 			(void) snprintf(tq_name, sizeof (tq_name), "%llX",
946 			    (unsigned long long int) hca->hca_guid);
947 			hca->cleanup_helper = ddi_taskq_create(NULL,
948 			    tq_name, 1, TASKQ_DEFAULTPRI, 0);
949 		}
950 
951 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
952 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
953 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
954 		    hca->iblock);
955 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
956 		    hca->iblock);
957 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
958 		hca->inuse = TRUE;
959 
960 		hca->next = ribstat->hcas_list;
961 		ribstat->hcas_list = hca;
962 		ribstat->nhca_inited++;
963 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
964 		continue;
965 
966 fail3:
967 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
968 fail2:
969 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
970 fail1:
971 		(void) ibt_close_hca(hca->hca_hdl);
972 		kmem_free(hca, sizeof (rib_hca_t));
973 	}
974 	rw_exit(&ribstat->hcas_list_lock);
975 	ibt_free_hca_list(hca_guids, ribstat->hca_count);
976 	rib_mod.rdma_count = rib_stat->nhca_inited;
977 
978 	/*
979 	 * return success if at least one new hca has been configured.
980 	 */
981 	if (ribstat->nhca_inited != old_nhca_inited)
982 		return (RDMA_SUCCESS);
983 	else
984 		return (RDMA_FAILED);
985 }
986 
987 /*
988  * Callback routines
989  */
990 
991 /*
992  * SCQ handlers
993  */
994 /* ARGSUSED */
995 static void
996 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
997 {
998 	ibt_status_t	ibt_status;
999 	ibt_wc_t	wc;
1000 	struct send_wid	*wd;
1001 	CONN		*conn;
1002 	rib_qp_t	*qp;
1003 	int		i;
1004 
1005 	/*
1006 	 * Re-enable cq notify here to avoid missing any
1007 	 * completion queue notification.
1008 	 */
1009 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1010 
1011 	ibt_status = IBT_SUCCESS;
1012 	while (ibt_status != IBT_CQ_EMPTY) {
1013 		bzero(&wc, sizeof (wc));
1014 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1015 		if (ibt_status != IBT_SUCCESS)
1016 			return;
1017 
1018 		/*
1019 		 * Got a send completion
1020 		 */
1021 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1022 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1023 			qp = wd->qp;
1024 			conn = qptoc(qp);
1025 
1026 			mutex_enter(&wd->sendwait_lock);
1027 			switch (wc.wc_status) {
1028 			case IBT_WC_SUCCESS:
1029 				wd->status = RDMA_SUCCESS;
1030 				break;
1031 			default:
1032 /*
1033  *    RC Send Q Error Code		Local state     Remote State
1034  *    ==================== 		===========     ============
1035  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
1036  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
1037  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
1038  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
1039  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
1040  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
1041  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
1042  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
1043  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
1044  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
1045  *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
1046  */
1047 				/*
1048 				 * Channel in error state. Set connection to
1049 				 * ERROR and cleanup will happen either from
1050 				 * conn_release  or from rib_conn_get
1051 				 */
1052 				wd->status = RDMA_FAILED;
1053 				mutex_enter(&conn->c_lock);
1054 				if (conn->c_state != C_DISCONN_PEND)
1055 					conn->c_state = C_ERROR_CONN;
1056 				mutex_exit(&conn->c_lock);
1057 				break;
1058 			}
1059 
1060 			if (wd->cv_sig == 1) {
1061 				/*
1062 				 * Notify poster
1063 				 */
1064 				cv_signal(&wd->wait_cv);
1065 				mutex_exit(&wd->sendwait_lock);
1066 			} else {
1067 				/*
1068 				 * Poster not waiting for notification.
1069 				 * Free the send buffers and send_wid
1070 				 */
1071 				for (i = 0; i < wd->nsbufs; i++) {
1072 					rib_rbuf_free(qptoc(wd->qp),
1073 					    SEND_BUFFER,
1074 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1075 				}
1076 
1077 				/* decrement the send ref count */
1078 				rib_send_rele(qp);
1079 
1080 				mutex_exit(&wd->sendwait_lock);
1081 				(void) rib_free_sendwait(wd);
1082 			}
1083 		}
1084 	}
1085 }
1086 
1087 /* ARGSUSED */
1088 static void
1089 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1090 {
1091 	ibt_status_t	ibt_status;
1092 	ibt_wc_t	wc;
1093 	struct send_wid	*wd;
1094 	rib_qp_t	*qp;
1095 	CONN		*conn;
1096 	int		i;
1097 
1098 	/*
1099 	 * Re-enable cq notify here to avoid missing any
1100 	 * completion queue notification.
1101 	 */
1102 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1103 
1104 	ibt_status = IBT_SUCCESS;
1105 	while (ibt_status != IBT_CQ_EMPTY) {
1106 		bzero(&wc, sizeof (wc));
1107 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1108 		if (ibt_status != IBT_SUCCESS)
1109 			return;
1110 
1111 		/*
1112 		 * Got a send completion
1113 		 */
1114 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1115 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1116 			qp = wd->qp;
1117 			conn = qptoc(qp);
1118 			mutex_enter(&wd->sendwait_lock);
1119 
1120 			switch (wc.wc_status) {
1121 			case IBT_WC_SUCCESS:
1122 				wd->status = RDMA_SUCCESS;
1123 				break;
1124 			default:
1125 				/*
1126 				 * Channel in error state. Set connection to
1127 				 * ERROR and cleanup will happen either from
1128 				 * conn_release  or conn timeout.
1129 				 */
1130 				wd->status = RDMA_FAILED;
1131 				mutex_enter(&conn->c_lock);
1132 				if (conn->c_state != C_DISCONN_PEND)
1133 					conn->c_state = C_ERROR_CONN;
1134 				mutex_exit(&conn->c_lock);
1135 				break;
1136 			}
1137 
1138 			if (wd->cv_sig == 1) {
1139 				/*
1140 				 * Update completion status and notify poster
1141 				 */
1142 				cv_signal(&wd->wait_cv);
1143 				mutex_exit(&wd->sendwait_lock);
1144 			} else {
1145 				/*
1146 				 * Poster not waiting for notification.
1147 				 * Free the send buffers and send_wid
1148 				 */
1149 				for (i = 0; i < wd->nsbufs; i++) {
1150 					rib_rbuf_free(qptoc(wd->qp),
1151 					    SEND_BUFFER,
1152 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1153 				}
1154 
1155 				/* decrement the send ref count */
1156 				rib_send_rele(qp);
1157 
1158 				mutex_exit(&wd->sendwait_lock);
1159 				(void) rib_free_sendwait(wd);
1160 			}
1161 		}
1162 	}
1163 }
1164 
1165 /*
1166  * RCQ handler
1167  */
1168 /* ARGSUSED */
1169 static void
1170 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1171 {
1172 	rib_qp_t	*qp;
1173 	ibt_status_t	ibt_status;
1174 	ibt_wc_t	wc;
1175 	struct recv_wid	*rwid;
1176 
1177 	/*
1178 	 * Re-enable cq notify here to avoid missing any
1179 	 * completion queue notification.
1180 	 */
1181 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1182 
1183 	ibt_status = IBT_SUCCESS;
1184 	while (ibt_status != IBT_CQ_EMPTY) {
1185 		bzero(&wc, sizeof (wc));
1186 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1187 		if (ibt_status != IBT_SUCCESS)
1188 			return;
1189 
1190 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1191 		qp = rwid->qp;
1192 		if (wc.wc_status == IBT_WC_SUCCESS) {
1193 			XDR	inxdrs, *xdrs;
1194 			uint_t	xid, vers, op, find_xid = 0;
1195 			struct reply	*r;
1196 			CONN *conn = qptoc(qp);
1197 			uint32_t rdma_credit = 0;
1198 
1199 			xdrs = &inxdrs;
1200 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1201 			    wc.wc_bytes_xfer, XDR_DECODE);
1202 			/*
1203 			 * Treat xid as opaque (xid is the first entity
1204 			 * in the rpc rdma message).
1205 			 */
1206 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1207 
1208 			/* Skip xid and set the xdr position accordingly. */
1209 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1210 			(void) xdr_u_int(xdrs, &vers);
1211 			(void) xdr_u_int(xdrs, &rdma_credit);
1212 			(void) xdr_u_int(xdrs, &op);
1213 			XDR_DESTROY(xdrs);
1214 
1215 			if (vers != RPCRDMA_VERS) {
1216 				/*
1217 				 * Invalid RPC/RDMA version. Cannot
1218 				 * interoperate.  Set connection to
1219 				 * ERROR state and bail out.
1220 				 */
1221 				mutex_enter(&conn->c_lock);
1222 				if (conn->c_state != C_DISCONN_PEND)
1223 					conn->c_state = C_ERROR_CONN;
1224 				mutex_exit(&conn->c_lock);
1225 				rib_rbuf_free(conn, RECV_BUFFER,
1226 				    (void *)(uintptr_t)rwid->addr);
1227 				rib_free_wid(rwid);
1228 				continue;
1229 			}
1230 
1231 			mutex_enter(&qp->replylist_lock);
1232 			for (r = qp->replylist; r != NULL; r = r->next) {
1233 				if (r->xid == xid) {
1234 					find_xid = 1;
1235 					switch (op) {
1236 					case RDMA_MSG:
1237 					case RDMA_NOMSG:
1238 					case RDMA_MSGP:
1239 						r->status = RDMA_SUCCESS;
1240 						r->vaddr_cq = rwid->addr;
1241 						r->bytes_xfer =
1242 						    wc.wc_bytes_xfer;
1243 						cv_signal(&r->wait_cv);
1244 						break;
1245 					default:
1246 						rib_rbuf_free(qptoc(qp),
1247 						    RECV_BUFFER,
1248 						    (void *)(uintptr_t)
1249 						    rwid->addr);
1250 						break;
1251 					}
1252 					break;
1253 				}
1254 			}
1255 			mutex_exit(&qp->replylist_lock);
1256 			if (find_xid == 0) {
1257 				/* RPC caller not waiting for reply */
1258 
1259 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1260 				    int, xid);
1261 
1262 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1263 				    (void *)(uintptr_t)rwid->addr);
1264 			}
1265 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1266 			CONN *conn = qptoc(qp);
1267 
1268 			/*
1269 			 * Connection being flushed. Just free
1270 			 * the posted buffer
1271 			 */
1272 			rib_rbuf_free(conn, RECV_BUFFER,
1273 			    (void *)(uintptr_t)rwid->addr);
1274 		} else {
1275 			CONN *conn = qptoc(qp);
1276 /*
1277  *  RC Recv Q Error Code		Local state     Remote State
1278  *  ====================		===========     ============
1279  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1280  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1281  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1282  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1283  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1284  *  IBT_WC_WR_FLUSHED_ERR               None            None
1285  */
1286 			/*
1287 			 * Channel in error state. Set connection
1288 			 * in ERROR state.
1289 			 */
1290 			mutex_enter(&conn->c_lock);
1291 			if (conn->c_state != C_DISCONN_PEND)
1292 				conn->c_state = C_ERROR_CONN;
1293 			mutex_exit(&conn->c_lock);
1294 			rib_rbuf_free(conn, RECV_BUFFER,
1295 			    (void *)(uintptr_t)rwid->addr);
1296 		}
1297 		rib_free_wid(rwid);
1298 	}
1299 }
1300 
1301 /* Server side */
1302 /* ARGSUSED */
1303 static void
1304 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1305 {
1306 	rdma_recv_data_t *rdp;
1307 	rib_qp_t	*qp;
1308 	ibt_status_t	ibt_status;
1309 	ibt_wc_t	wc;
1310 	struct svc_recv	*s_recvp;
1311 	CONN		*conn;
1312 	mblk_t		*mp;
1313 
1314 	/*
1315 	 * Re-enable cq notify here to avoid missing any
1316 	 * completion queue notification.
1317 	 */
1318 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1319 
1320 	ibt_status = IBT_SUCCESS;
1321 	while (ibt_status != IBT_CQ_EMPTY) {
1322 		bzero(&wc, sizeof (wc));
1323 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1324 		if (ibt_status != IBT_SUCCESS)
1325 			return;
1326 
1327 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1328 		qp = s_recvp->qp;
1329 		conn = qptoc(qp);
1330 		mutex_enter(&qp->posted_rbufs_lock);
1331 		qp->n_posted_rbufs--;
1332 		if (qp->n_posted_rbufs == 0)
1333 			cv_signal(&qp->posted_rbufs_cv);
1334 		mutex_exit(&qp->posted_rbufs_lock);
1335 
1336 		if (wc.wc_status == IBT_WC_SUCCESS) {
1337 			XDR	inxdrs, *xdrs;
1338 			uint_t	xid, vers, op;
1339 			uint32_t rdma_credit;
1340 
1341 			xdrs = &inxdrs;
1342 			/* s_recvp->vaddr stores data */
1343 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1344 			    wc.wc_bytes_xfer, XDR_DECODE);
1345 
1346 			/*
1347 			 * Treat xid as opaque (xid is the first entity
1348 			 * in the rpc rdma message).
1349 			 */
1350 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1351 			/* Skip xid and set the xdr position accordingly. */
1352 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1353 			if (!xdr_u_int(xdrs, &vers) ||
1354 			    !xdr_u_int(xdrs, &rdma_credit) ||
1355 			    !xdr_u_int(xdrs, &op)) {
1356 				rib_rbuf_free(conn, RECV_BUFFER,
1357 				    (void *)(uintptr_t)s_recvp->vaddr);
1358 				XDR_DESTROY(xdrs);
1359 				(void) rib_free_svc_recv(s_recvp);
1360 				continue;
1361 			}
1362 			XDR_DESTROY(xdrs);
1363 
1364 			if (vers != RPCRDMA_VERS) {
1365 				/*
1366 				 * Invalid RPC/RDMA version.
1367 				 * Drop rpc rdma message.
1368 				 */
1369 				rib_rbuf_free(conn, RECV_BUFFER,
1370 				    (void *)(uintptr_t)s_recvp->vaddr);
1371 				(void) rib_free_svc_recv(s_recvp);
1372 				continue;
1373 			}
1374 			/*
1375 			 * Is this for RDMA_DONE?
1376 			 */
1377 			if (op == RDMA_DONE) {
1378 				rib_rbuf_free(conn, RECV_BUFFER,
1379 				    (void *)(uintptr_t)s_recvp->vaddr);
1380 				/*
1381 				 * Wake up the thread waiting on
1382 				 * a RDMA_DONE for xid
1383 				 */
1384 				mutex_enter(&qp->rdlist_lock);
1385 				rdma_done_notify(qp, xid);
1386 				mutex_exit(&qp->rdlist_lock);
1387 				(void) rib_free_svc_recv(s_recvp);
1388 				continue;
1389 			}
1390 
1391 			mutex_enter(&plugin_state_lock);
1392 			if (plugin_state == ACCEPT) {
1393 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1394 				    == NULL)
1395 					(void) strwaitbuf(
1396 					    sizeof (*rdp), BPRI_LO);
1397 				/*
1398 				 * Plugin is in accept state, hence the master
1399 				 * transport queue for this is still accepting
1400 				 * requests. Hence we can call svc_queuereq to
1401 				 * queue this recieved msg.
1402 				 */
1403 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1404 				rdp->conn = conn;
1405 				rdp->rpcmsg.addr =
1406 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1407 				rdp->rpcmsg.type = RECV_BUFFER;
1408 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1409 				rdp->status = wc.wc_status;
1410 				mutex_enter(&conn->c_lock);
1411 				conn->c_ref++;
1412 				mutex_exit(&conn->c_lock);
1413 				mp->b_wptr += sizeof (*rdp);
1414 				svc_queuereq((queue_t *)rib_stat->q, mp);
1415 				mutex_exit(&plugin_state_lock);
1416 			} else {
1417 				/*
1418 				 * The master transport for this is going
1419 				 * away and the queue is not accepting anymore
1420 				 * requests for krpc, so don't do anything, just
1421 				 * free the msg.
1422 				 */
1423 				mutex_exit(&plugin_state_lock);
1424 				rib_rbuf_free(conn, RECV_BUFFER,
1425 				    (void *)(uintptr_t)s_recvp->vaddr);
1426 			}
1427 		} else {
1428 			rib_rbuf_free(conn, RECV_BUFFER,
1429 			    (void *)(uintptr_t)s_recvp->vaddr);
1430 		}
1431 		(void) rib_free_svc_recv(s_recvp);
1432 	}
1433 }
1434 
1435 static void
1436 rib_attach_hca()
1437 {
1438 	mutex_enter(&rib_stat->open_hca_lock);
1439 	rpcib_open_hcas(rib_stat);
1440 	rib_listen(NULL);
1441 	mutex_exit(&rib_stat->open_hca_lock);
1442 }
1443 
1444 /*
1445  * Handles DR event of IBT_HCA_DETACH_EVENT.
1446  */
1447 /* ARGSUSED */
1448 static void
1449 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1450 	ibt_async_code_t code, ibt_async_event_t *event)
1451 {
1452 	switch (code) {
1453 	case IBT_HCA_ATTACH_EVENT:
1454 		rib_attach_hca();
1455 		break;
1456 	case IBT_HCA_DETACH_EVENT:
1457 	{
1458 		rib_hca_t *hca;
1459 
1460 		rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1461 		for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1462 			rw_enter(&hca->state_lock, RW_READER);
1463 			if ((hca->state != HCA_DETACHED) &&
1464 			    (hca->hca_hdl == hca_hdl)) {
1465 				rw_exit(&hca->state_lock);
1466 				break;
1467 			}
1468 			rw_exit(&hca->state_lock);
1469 		}
1470 		rw_exit(&rib_stat->hcas_list_lock);
1471 
1472 		if (hca == NULL)
1473 			return;
1474 		ASSERT(hca->hca_hdl == hca_hdl);
1475 		rib_detach_hca(hca);
1476 #ifdef DEBUG
1477 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1478 #endif
1479 		break;
1480 	}
1481 	case IBT_EVENT_PORT_UP:
1482 		/*
1483 		 * A port is up. We should call rib_listen() since there is
1484 		 * a chance that rib_listen() may have failed during
1485 		 * rib_attach_hca() because the port had not been up yet.
1486 		 */
1487 		rib_listen(NULL);
1488 #ifdef DEBUG
1489 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1490 #endif
1491 		break;
1492 #ifdef DEBUG
1493 	case IBT_EVENT_PATH_MIGRATED:
1494 		cmn_err(CE_NOTE, "rib_async_handler(): "
1495 		    "IBT_EVENT_PATH_MIGRATED\n");
1496 		break;
1497 	case IBT_EVENT_SQD:
1498 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1499 		break;
1500 	case IBT_EVENT_COM_EST:
1501 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1502 		break;
1503 	case IBT_ERROR_CATASTROPHIC_CHAN:
1504 		cmn_err(CE_NOTE, "rib_async_handler(): "
1505 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1506 		break;
1507 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1508 		cmn_err(CE_NOTE, "rib_async_handler(): "
1509 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1510 		break;
1511 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1512 		cmn_err(CE_NOTE, "rib_async_handler(): "
1513 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1514 		break;
1515 	case IBT_ERROR_PATH_MIGRATE_REQ:
1516 		cmn_err(CE_NOTE, "rib_async_handler(): "
1517 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1518 		break;
1519 	case IBT_ERROR_CQ:
1520 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1521 		break;
1522 	case IBT_ERROR_PORT_DOWN:
1523 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1524 		break;
1525 	case IBT_ASYNC_OPAQUE1:
1526 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1527 		break;
1528 	case IBT_ASYNC_OPAQUE2:
1529 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1530 		break;
1531 	case IBT_ASYNC_OPAQUE3:
1532 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1533 		break;
1534 	case IBT_ASYNC_OPAQUE4:
1535 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1536 		break;
1537 #endif
1538 	default:
1539 		break;
1540 	}
1541 }
1542 
1543 /*
1544  * Client's reachable function.
1545  */
1546 static rdma_stat
1547 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1548 {
1549 	rdma_stat	status;
1550 	rpcib_ping_t	rpt;
1551 
1552 	bzero(&rpt, sizeof (rpcib_ping_t));
1553 	status = rib_ping_srv(addr_type, raddr, &rpt);
1554 
1555 	if (status == RDMA_SUCCESS) {
1556 		*handle = (void *)rpt.hca;
1557 		return (RDMA_SUCCESS);
1558 	} else {
1559 		*handle = NULL;
1560 		DTRACE_PROBE(rpcib__i__pingfailed);
1561 		return (RDMA_FAILED);
1562 	}
1563 }
1564 
1565 /* Client side qp creation */
1566 static rdma_stat
1567 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1568 {
1569 	rib_qp_t	*kqp = NULL;
1570 	CONN		*conn;
1571 	rdma_clnt_cred_ctrl_t *cc_info;
1572 
1573 	ASSERT(qp != NULL);
1574 	*qp = NULL;
1575 
1576 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1577 	conn = qptoc(kqp);
1578 	kqp->hca = hca;
1579 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1580 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1581 
1582 	kqp->mode = RIB_CLIENT;
1583 	kqp->chan_flags = IBT_BLOCKING;
1584 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1585 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1586 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1587 	/*
1588 	 * Initialize
1589 	 */
1590 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1591 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1592 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1593 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1594 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1595 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1596 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1597 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1598 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1599 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1600 	/*
1601 	 * Initialize the client credit control
1602 	 * portion of the rdmaconn struct.
1603 	 */
1604 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1605 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1606 	cc_info->clnt_cc_granted_ops = 0;
1607 	cc_info->clnt_cc_in_flight_ops = 0;
1608 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1609 
1610 	*qp = kqp;
1611 	return (RDMA_SUCCESS);
1612 }
1613 
1614 /* Server side qp creation */
1615 static rdma_stat
1616 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1617 {
1618 	rib_qp_t	*kqp = NULL;
1619 	ibt_chan_sizes_t	chan_sizes;
1620 	ibt_rc_chan_alloc_args_t	qp_attr;
1621 	ibt_status_t		ibt_status;
1622 	rdma_srv_cred_ctrl_t *cc_info;
1623 
1624 	*qp = NULL;
1625 
1626 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1627 	kqp->hca = hca;
1628 	kqp->port_num = port;
1629 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1630 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1631 
1632 	/*
1633 	 * Create the qp handle
1634 	 */
1635 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1636 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1637 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1638 	qp_attr.rc_pd = hca->pd_hdl;
1639 	qp_attr.rc_hca_port_num = port;
1640 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1641 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1642 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1643 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1644 	qp_attr.rc_clone_chan = NULL;
1645 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1646 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1647 
1648 	rw_enter(&hca->state_lock, RW_READER);
1649 	if (hca->state != HCA_DETACHED) {
1650 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1651 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1652 		    &chan_sizes);
1653 	} else {
1654 		rw_exit(&hca->state_lock);
1655 		goto fail;
1656 	}
1657 	rw_exit(&hca->state_lock);
1658 
1659 	if (ibt_status != IBT_SUCCESS) {
1660 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1661 		    int, ibt_status);
1662 		goto fail;
1663 	}
1664 
1665 	kqp->mode = RIB_SERVER;
1666 	kqp->chan_flags = IBT_BLOCKING;
1667 	kqp->q = q;	/* server ONLY */
1668 
1669 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1670 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1671 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1672 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1673 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1674 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1675 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1676 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1677 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1678 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1679 	/*
1680 	 * Set the private data area to qp to be used in callbacks
1681 	 */
1682 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1683 	kqp->rdmaconn.c_state = C_CONNECTED;
1684 
1685 	/*
1686 	 * Initialize the server credit control
1687 	 * portion of the rdmaconn struct.
1688 	 */
1689 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1690 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1691 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1692 	cc_info->srv_cc_cur_buffers_used = 0;
1693 	cc_info->srv_cc_posted = preposted_rbufs;
1694 
1695 	*qp = kqp;
1696 
1697 	return (RDMA_SUCCESS);
1698 fail:
1699 	if (kqp)
1700 		kmem_free(kqp, sizeof (rib_qp_t));
1701 
1702 	return (RDMA_FAILED);
1703 }
1704 
1705 /* ARGSUSED */
1706 ibt_cm_status_t
1707 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1708     ibt_cm_return_args_t *ret_args, void *priv_data,
1709     ibt_priv_data_len_t len)
1710 {
1711 	rib_hca_t	*hca;
1712 
1713 	hca = (rib_hca_t *)clnt_hdl;
1714 
1715 	switch (event->cm_type) {
1716 
1717 	/* got a connection close event */
1718 	case IBT_CM_EVENT_CONN_CLOSED:
1719 	{
1720 		CONN	*conn;
1721 		rib_qp_t *qp;
1722 
1723 		/* check reason why connection was closed */
1724 		switch (event->cm_event.closed) {
1725 		case IBT_CM_CLOSED_DREP_RCVD:
1726 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1727 		case IBT_CM_CLOSED_DUP:
1728 		case IBT_CM_CLOSED_ABORT:
1729 		case IBT_CM_CLOSED_ALREADY:
1730 			/*
1731 			 * These cases indicate the local end initiated
1732 			 * the closing of the channel. Nothing to do here.
1733 			 */
1734 			break;
1735 		default:
1736 			/*
1737 			 * Reason for CONN_CLOSED event must be one of
1738 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1739 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1740 			 * the remote end is closing the channel. In these
1741 			 * cases free the channel and transition to error
1742 			 * state
1743 			 */
1744 			qp = ibt_get_chan_private(event->cm_channel);
1745 			conn = qptoc(qp);
1746 			mutex_enter(&conn->c_lock);
1747 			if (conn->c_state == C_DISCONN_PEND) {
1748 				mutex_exit(&conn->c_lock);
1749 				break;
1750 			}
1751 
1752 			conn->c_state = C_ERROR_CONN;
1753 
1754 			/*
1755 			 * Free the conn if c_ref is down to 0 already
1756 			 */
1757 			if (conn->c_ref == 0) {
1758 				/*
1759 				 * Remove from list and free conn
1760 				 */
1761 				conn->c_state = C_DISCONN_PEND;
1762 				mutex_exit(&conn->c_lock);
1763 				rw_enter(&hca->state_lock, RW_READER);
1764 				if (hca->state != HCA_DETACHED)
1765 					(void) rib_disconnect_channel(conn,
1766 					    &hca->cl_conn_list);
1767 				rw_exit(&hca->state_lock);
1768 			} else {
1769 				/*
1770 				 * conn will be freed when c_ref goes to 0.
1771 				 * Indicate to cleaning thread not to close
1772 				 * the connection, but just free the channel.
1773 				 */
1774 				conn->c_flags |= C_CLOSE_NOTNEEDED;
1775 				mutex_exit(&conn->c_lock);
1776 			}
1777 #ifdef DEBUG
1778 			if (rib_debug)
1779 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1780 				    "(CONN_CLOSED) channel disconnected");
1781 #endif
1782 			break;
1783 		}
1784 		break;
1785 	}
1786 	default:
1787 		break;
1788 	}
1789 	return (IBT_CM_ACCEPT);
1790 }
1791 
1792 /*
1793  * Connect to the server.
1794  */
1795 rdma_stat
1796 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1797 {
1798 	ibt_chan_open_args_t	chan_args;	/* channel args */
1799 	ibt_chan_sizes_t	chan_sizes;
1800 	ibt_rc_chan_alloc_args_t	qp_attr;
1801 	ibt_status_t		ibt_status;
1802 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1803 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1804 	ibt_ip_cm_info_t	ipcm_info;
1805 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1806 
1807 
1808 	(void) bzero(&chan_args, sizeof (chan_args));
1809 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1810 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1811 
1812 	ipcm_info.src_addr.family = rptp->srcip.family;
1813 	switch (ipcm_info.src_addr.family) {
1814 	case AF_INET:
1815 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1816 		break;
1817 	case AF_INET6:
1818 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1819 		break;
1820 	}
1821 
1822 	ipcm_info.dst_addr.family = rptp->srcip.family;
1823 	switch (ipcm_info.dst_addr.family) {
1824 	case AF_INET:
1825 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1826 		break;
1827 	case AF_INET6:
1828 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1829 		break;
1830 	}
1831 
1832 	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1833 
1834 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1835 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1836 
1837 	if (ibt_status != IBT_SUCCESS) {
1838 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1839 		return (-1);
1840 	}
1841 
1842 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1843 	/* Alloc a RC channel */
1844 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1845 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1846 	qp_attr.rc_pd = hca->pd_hdl;
1847 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1848 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1849 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1850 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1851 	qp_attr.rc_clone_chan = NULL;
1852 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1853 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1854 
1855 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1856 	chan_args.oc_path = &rptp->path;
1857 
1858 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1859 	chan_args.oc_cm_clnt_private = (void *)hca;
1860 	chan_args.oc_rdma_ra_out = 4;
1861 	chan_args.oc_rdma_ra_in = 4;
1862 	chan_args.oc_path_retry_cnt = 2;
1863 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1864 	chan_args.oc_priv_data = cmp_ip_pvt;
1865 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1866 
1867 refresh:
1868 	rw_enter(&hca->state_lock, RW_READER);
1869 	if (hca->state != HCA_DETACHED) {
1870 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1871 		    IBT_ACHAN_NO_FLAGS,
1872 		    &qp_attr, &qp->qp_hdl,
1873 		    &chan_sizes);
1874 	} else {
1875 		rw_exit(&hca->state_lock);
1876 		return (RDMA_FAILED);
1877 	}
1878 	rw_exit(&hca->state_lock);
1879 
1880 	if (ibt_status != IBT_SUCCESS) {
1881 		DTRACE_PROBE1(rpcib__i_conntosrv,
1882 		    int, ibt_status);
1883 		return (RDMA_FAILED);
1884 	}
1885 
1886 	/* Connect to the Server */
1887 	(void) bzero(&ret_args, sizeof (ret_args));
1888 	mutex_enter(&qp->cb_lock);
1889 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1890 	    IBT_BLOCKING, &chan_args, &ret_args);
1891 	if (ibt_status != IBT_SUCCESS) {
1892 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1893 		    int, ibt_status, int, ret_args.rc_status);
1894 
1895 		(void) ibt_free_channel(qp->qp_hdl);
1896 		qp->qp_hdl = NULL;
1897 		mutex_exit(&qp->cb_lock);
1898 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1899 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1900 			/*
1901 			 * Got IBT_CM_CONN_STALE probably because of stale
1902 			 * data on the passive end of a channel that existed
1903 			 * prior to reboot. Retry establishing a channel
1904 			 * REFRESH_ATTEMPTS times, during which time the
1905 			 * stale conditions on the server might clear up.
1906 			 */
1907 			goto refresh;
1908 		}
1909 		return (RDMA_FAILED);
1910 	}
1911 	mutex_exit(&qp->cb_lock);
1912 	/*
1913 	 * Set the private data area to qp to be used in callbacks
1914 	 */
1915 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1916 	return (RDMA_SUCCESS);
1917 }
1918 
1919 rdma_stat
1920 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1921 {
1922 	uint_t			i, addr_count;
1923 	ibt_status_t		ibt_status;
1924 	uint8_t			num_paths_p;
1925 	ibt_ip_path_attr_t	ipattr;
1926 	ibt_path_ip_src_t	srcip;
1927 	rpcib_ipaddrs_t		addrs4;
1928 	rpcib_ipaddrs_t		addrs6;
1929 	struct sockaddr_in	*sinp;
1930 	struct sockaddr_in6	*sin6p;
1931 	rdma_stat		retval = RDMA_FAILED;
1932 	rib_hca_t *hca;
1933 
1934 	if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1935 		return (RDMA_INVAL);
1936 	ASSERT(raddr->buf != NULL);
1937 
1938 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1939 
1940 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1941 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1942 		retval = RDMA_FAILED;
1943 		goto done2;
1944 	}
1945 
1946 	if (addr_type == AF_INET) {
1947 		addr_count = addrs4.ri_count;
1948 		sinp = (struct sockaddr_in *)raddr->buf;
1949 		rptp->dstip.family = AF_INET;
1950 		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1951 		sinp = addrs4.ri_list;
1952 	} else {
1953 		addr_count = addrs6.ri_count;
1954 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1955 		rptp->dstip.family = AF_INET6;
1956 		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1957 		sin6p = addrs6.ri_list;
1958 	}
1959 
1960 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1961 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1962 		rw_enter(&hca->state_lock, RW_READER);
1963 		if (hca->state == HCA_DETACHED) {
1964 			rw_exit(&hca->state_lock);
1965 			continue;
1966 		}
1967 
1968 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1969 		ipattr.ipa_hca_guid	= hca->hca_guid;
1970 		ipattr.ipa_ndst		= 1;
1971 		ipattr.ipa_max_paths	= 1;
1972 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1973 		for (i = 0; i < addr_count; i++) {
1974 			num_paths_p = 0;
1975 			if (addr_type == AF_INET) {
1976 				ipattr.ipa_src_ip.un.ip4addr =
1977 				    sinp[i].sin_addr.s_addr;
1978 			} else {
1979 				ipattr.ipa_src_ip.un.ip6addr =
1980 				    sin6p[i].sin6_addr;
1981 			}
1982 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1983 
1984 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1985 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1986 			    &num_paths_p, &srcip);
1987 			if (ibt_status == IBT_SUCCESS &&
1988 			    num_paths_p != 0 &&
1989 			    rptp->path.pi_hca_guid == hca->hca_guid) {
1990 				rptp->hca = hca;
1991 				rw_exit(&hca->state_lock);
1992 				if (addr_type == AF_INET) {
1993 					rptp->srcip.family = AF_INET;
1994 					rptp->srcip.un.ip4addr =
1995 					    srcip.ip_primary.un.ip4addr;
1996 				} else {
1997 					rptp->srcip.family = AF_INET6;
1998 					rptp->srcip.un.ip6addr =
1999 					    srcip.ip_primary.un.ip6addr;
2000 
2001 				}
2002 				retval = RDMA_SUCCESS;
2003 				goto done1;
2004 			}
2005 		}
2006 		rw_exit(&hca->state_lock);
2007 	}
2008 done1:
2009 	rw_exit(&rib_stat->hcas_list_lock);
2010 done2:
2011 	if (addrs4.ri_size > 0)
2012 		kmem_free(addrs4.ri_list, addrs4.ri_size);
2013 	if (addrs6.ri_size > 0)
2014 		kmem_free(addrs6.ri_list, addrs6.ri_size);
2015 	return (retval);
2016 }
2017 
2018 /*
2019  * Close channel, remove from connection list and
2020  * free up resources allocated for that channel.
2021  */
2022 rdma_stat
2023 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2024 {
2025 	rib_qp_t	*qp = ctoqp(conn);
2026 	rib_hca_t	*hca;
2027 
2028 	mutex_enter(&conn->c_lock);
2029 	if (conn->c_timeout != NULL) {
2030 		mutex_exit(&conn->c_lock);
2031 		(void) untimeout(conn->c_timeout);
2032 		mutex_enter(&conn->c_lock);
2033 	}
2034 
2035 	while (conn->c_flags & C_CLOSE_PENDING) {
2036 		cv_wait(&conn->c_cv, &conn->c_lock);
2037 	}
2038 	mutex_exit(&conn->c_lock);
2039 
2040 	/*
2041 	 * c_ref == 0 and connection is in C_DISCONN_PEND
2042 	 */
2043 	hca = qp->hca;
2044 	if (conn_list != NULL)
2045 		(void) rib_rm_conn(conn, conn_list);
2046 
2047 	/*
2048 	 * There is only one case where we get here with
2049 	 * qp_hdl = NULL, which is during connection setup on
2050 	 * the client. In such a case there are no posted
2051 	 * send/recv buffers.
2052 	 */
2053 	if (qp->qp_hdl != NULL) {
2054 		mutex_enter(&qp->posted_rbufs_lock);
2055 		while (qp->n_posted_rbufs)
2056 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2057 		mutex_exit(&qp->posted_rbufs_lock);
2058 
2059 		mutex_enter(&qp->send_rbufs_lock);
2060 		while (qp->n_send_rbufs)
2061 			cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2062 		mutex_exit(&qp->send_rbufs_lock);
2063 
2064 		(void) ibt_free_channel(qp->qp_hdl);
2065 		qp->qp_hdl = NULL;
2066 	}
2067 
2068 	ASSERT(qp->rdlist == NULL);
2069 
2070 	if (qp->replylist != NULL) {
2071 		(void) rib_rem_replylist(qp);
2072 	}
2073 
2074 	cv_destroy(&qp->cb_conn_cv);
2075 	cv_destroy(&qp->posted_rbufs_cv);
2076 	cv_destroy(&qp->send_rbufs_cv);
2077 	mutex_destroy(&qp->cb_lock);
2078 	mutex_destroy(&qp->replylist_lock);
2079 	mutex_destroy(&qp->posted_rbufs_lock);
2080 	mutex_destroy(&qp->send_rbufs_lock);
2081 	mutex_destroy(&qp->rdlist_lock);
2082 
2083 	cv_destroy(&conn->c_cv);
2084 	mutex_destroy(&conn->c_lock);
2085 
2086 	if (conn->c_raddr.buf != NULL) {
2087 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2088 	}
2089 	if (conn->c_laddr.buf != NULL) {
2090 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2091 	}
2092 
2093 	/*
2094 	 * Credit control cleanup.
2095 	 */
2096 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2097 		rdma_clnt_cred_ctrl_t *cc_info;
2098 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2099 		cv_destroy(&cc_info->clnt_cc_cv);
2100 	}
2101 
2102 	kmem_free(qp, sizeof (rib_qp_t));
2103 
2104 	/*
2105 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2106 	 * then the hca is no longer being used.
2107 	 */
2108 	if (conn_list != NULL) {
2109 		rw_enter(&hca->state_lock, RW_READER);
2110 		if (hca->state == HCA_DETACHED) {
2111 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2112 			if (hca->srv_conn_list.conn_hd == NULL) {
2113 				rw_enter(&hca->cl_conn_list.conn_lock,
2114 				    RW_READER);
2115 
2116 				if (hca->cl_conn_list.conn_hd == NULL) {
2117 					mutex_enter(&hca->inuse_lock);
2118 					hca->inuse = FALSE;
2119 					cv_signal(&hca->cb_cv);
2120 					mutex_exit(&hca->inuse_lock);
2121 				}
2122 				rw_exit(&hca->cl_conn_list.conn_lock);
2123 			}
2124 			rw_exit(&hca->srv_conn_list.conn_lock);
2125 		}
2126 		rw_exit(&hca->state_lock);
2127 	}
2128 
2129 	return (RDMA_SUCCESS);
2130 }
2131 
2132 /*
2133  * All sends are done under the protection of
2134  * the wdesc->sendwait_lock. n_send_rbufs count
2135  * is protected using the send_rbufs_lock.
2136  * lock ordering is:
2137  * sendwait_lock -> send_rbufs_lock
2138  */
2139 
2140 void
2141 rib_send_hold(rib_qp_t *qp)
2142 {
2143 	mutex_enter(&qp->send_rbufs_lock);
2144 	qp->n_send_rbufs++;
2145 	mutex_exit(&qp->send_rbufs_lock);
2146 }
2147 
2148 void
2149 rib_send_rele(rib_qp_t *qp)
2150 {
2151 	mutex_enter(&qp->send_rbufs_lock);
2152 	qp->n_send_rbufs--;
2153 	if (qp->n_send_rbufs == 0)
2154 		cv_signal(&qp->send_rbufs_cv);
2155 	mutex_exit(&qp->send_rbufs_lock);
2156 }
2157 
2158 /*
2159  * Wait for send completion notification. Only on receiving a
2160  * notification be it a successful or error completion, free the
2161  * send_wid.
2162  */
2163 static rdma_stat
2164 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2165 {
2166 	clock_t timout, cv_wait_ret;
2167 	rdma_stat error = RDMA_SUCCESS;
2168 	int	i;
2169 
2170 	/*
2171 	 * Wait for send to complete
2172 	 */
2173 	ASSERT(wd != NULL);
2174 	mutex_enter(&wd->sendwait_lock);
2175 	if (wd->status == (uint_t)SEND_WAIT) {
2176 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2177 		    ddi_get_lbolt();
2178 
2179 		if (qp->mode == RIB_SERVER) {
2180 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2181 			    &wd->sendwait_lock, timout)) > 0 &&
2182 			    wd->status == (uint_t)SEND_WAIT)
2183 				;
2184 			switch (cv_wait_ret) {
2185 			case -1:	/* timeout */
2186 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2187 
2188 				wd->cv_sig = 0;		/* no signal needed */
2189 				error = RDMA_TIMEDOUT;
2190 				break;
2191 			default:	/* got send completion */
2192 				break;
2193 			}
2194 		} else {
2195 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2196 			    &wd->sendwait_lock, timout)) > 0 &&
2197 			    wd->status == (uint_t)SEND_WAIT)
2198 				;
2199 			switch (cv_wait_ret) {
2200 			case -1:	/* timeout */
2201 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2202 
2203 				wd->cv_sig = 0;		/* no signal needed */
2204 				error = RDMA_TIMEDOUT;
2205 				break;
2206 			case 0:		/* interrupted */
2207 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2208 
2209 				wd->cv_sig = 0;		/* no signal needed */
2210 				error = RDMA_INTR;
2211 				break;
2212 			default:	/* got send completion */
2213 				break;
2214 			}
2215 		}
2216 	}
2217 
2218 	if (wd->status != (uint_t)SEND_WAIT) {
2219 		/* got send completion */
2220 		if (wd->status != RDMA_SUCCESS) {
2221 			switch (wd->status) {
2222 			case RDMA_CONNLOST:
2223 				error = RDMA_CONNLOST;
2224 				break;
2225 			default:
2226 				error = RDMA_FAILED;
2227 				break;
2228 			}
2229 		}
2230 		for (i = 0; i < wd->nsbufs; i++) {
2231 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2232 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2233 		}
2234 
2235 		rib_send_rele(qp);
2236 
2237 		mutex_exit(&wd->sendwait_lock);
2238 		(void) rib_free_sendwait(wd);
2239 
2240 	} else {
2241 		mutex_exit(&wd->sendwait_lock);
2242 	}
2243 	return (error);
2244 }
2245 
2246 static struct send_wid *
2247 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2248 {
2249 	struct send_wid	*wd;
2250 
2251 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2252 	wd->xid = xid;
2253 	wd->cv_sig = cv_sig;
2254 	wd->qp = qp;
2255 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2256 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2257 	wd->status = (uint_t)SEND_WAIT;
2258 
2259 	return (wd);
2260 }
2261 
2262 static int
2263 rib_free_sendwait(struct send_wid *wdesc)
2264 {
2265 	cv_destroy(&wdesc->wait_cv);
2266 	mutex_destroy(&wdesc->sendwait_lock);
2267 	kmem_free(wdesc, sizeof (*wdesc));
2268 
2269 	return (0);
2270 }
2271 
2272 static rdma_stat
2273 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2274 {
2275 	mutex_enter(&qp->replylist_lock);
2276 	if (rep != NULL) {
2277 		(void) rib_remreply(qp, rep);
2278 		mutex_exit(&qp->replylist_lock);
2279 		return (RDMA_SUCCESS);
2280 	}
2281 	mutex_exit(&qp->replylist_lock);
2282 	return (RDMA_FAILED);
2283 }
2284 
2285 /*
2286  * Send buffers are freed here only in case of error in posting
2287  * on QP. If the post succeeded, the send buffers are freed upon
2288  * send completion in rib_sendwait() or in the scq_handler.
2289  */
2290 rdma_stat
2291 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2292 	int send_sig, int cv_sig, caddr_t *swid)
2293 {
2294 	struct send_wid	*wdesc;
2295 	struct clist	*clp;
2296 	ibt_status_t	ibt_status = IBT_SUCCESS;
2297 	rdma_stat	ret = RDMA_SUCCESS;
2298 	ibt_send_wr_t	tx_wr;
2299 	int		i, nds;
2300 	ibt_wr_ds_t	sgl[DSEG_MAX];
2301 	uint_t		total_msg_size;
2302 	rib_qp_t	*qp;
2303 
2304 	qp = ctoqp(conn);
2305 
2306 	ASSERT(cl != NULL);
2307 
2308 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2309 
2310 	nds = 0;
2311 	total_msg_size = 0;
2312 	clp = cl;
2313 	while (clp != NULL) {
2314 		if (nds >= DSEG_MAX) {
2315 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2316 			return (RDMA_FAILED);
2317 		}
2318 		sgl[nds].ds_va = clp->w.c_saddr;
2319 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2320 		sgl[nds].ds_len = clp->c_len;
2321 		total_msg_size += clp->c_len;
2322 		clp = clp->c_next;
2323 		nds++;
2324 	}
2325 
2326 	if (send_sig) {
2327 		/* Set SEND_SIGNAL flag. */
2328 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2329 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2330 		*swid = (caddr_t)wdesc;
2331 		tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2332 		mutex_enter(&wdesc->sendwait_lock);
2333 		wdesc->nsbufs = nds;
2334 		for (i = 0; i < nds; i++) {
2335 			wdesc->sbufaddr[i] = sgl[i].ds_va;
2336 		}
2337 	} else {
2338 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2339 		*swid = NULL;
2340 		tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2341 	}
2342 
2343 	tx_wr.wr_opcode = IBT_WRC_SEND;
2344 	tx_wr.wr_trans = IBT_RC_SRV;
2345 	tx_wr.wr_nds = nds;
2346 	tx_wr.wr_sgl = sgl;
2347 
2348 	mutex_enter(&conn->c_lock);
2349 	if (conn->c_state == C_CONNECTED) {
2350 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2351 	}
2352 	if (conn->c_state != C_CONNECTED ||
2353 	    ibt_status != IBT_SUCCESS) {
2354 		if (conn->c_state != C_DISCONN_PEND)
2355 			conn->c_state = C_ERROR_CONN;
2356 		mutex_exit(&conn->c_lock);
2357 		if (send_sig) {
2358 			for (i = 0; i < nds; i++) {
2359 				rib_rbuf_free(conn, SEND_BUFFER,
2360 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2361 			}
2362 			mutex_exit(&wdesc->sendwait_lock);
2363 			(void) rib_free_sendwait(wdesc);
2364 		}
2365 		return (RDMA_CONNLOST);
2366 	}
2367 
2368 	mutex_exit(&conn->c_lock);
2369 
2370 	if (send_sig) {
2371 		rib_send_hold(qp);
2372 		mutex_exit(&wdesc->sendwait_lock);
2373 		if (cv_sig) {
2374 			/*
2375 			 * cv_wait for send to complete.
2376 			 * We can fail due to a timeout or signal or
2377 			 * unsuccessful send.
2378 			 */
2379 			ret = rib_sendwait(qp, wdesc);
2380 
2381 			return (ret);
2382 		}
2383 	}
2384 
2385 	return (RDMA_SUCCESS);
2386 }
2387 
2388 
2389 rdma_stat
2390 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2391 {
2392 	rdma_stat	ret;
2393 	caddr_t		wd;
2394 
2395 	/* send-wait & cv_signal */
2396 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2397 	return (ret);
2398 }
2399 
2400 /*
2401  * Deprecated/obsolete interface not used currently
2402  * but earlier used for READ-READ protocol.
2403  * Send RPC reply and wait for RDMA_DONE.
2404  */
2405 rdma_stat
2406 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2407 {
2408 	rdma_stat ret = RDMA_SUCCESS;
2409 	struct rdma_done_list *rd;
2410 	clock_t timout, cv_wait_ret;
2411 	caddr_t *wid = NULL;
2412 	rib_qp_t *qp = ctoqp(conn);
2413 
2414 	mutex_enter(&qp->rdlist_lock);
2415 	rd = rdma_done_add(qp, msgid);
2416 
2417 	/* No cv_signal (whether send-wait or no-send-wait) */
2418 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2419 
2420 	if (ret != RDMA_SUCCESS) {
2421 		rdma_done_rm(qp, rd);
2422 	} else {
2423 		/*
2424 		 * Wait for RDMA_DONE from remote end
2425 		 */
2426 		timout =
2427 		    drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2428 		cv_wait_ret = cv_timedwait(&rd->rdma_done_cv,
2429 		    &qp->rdlist_lock,
2430 		    timout);
2431 
2432 		rdma_done_rm(qp, rd);
2433 
2434 		if (cv_wait_ret < 0) {
2435 			ret = RDMA_TIMEDOUT;
2436 		}
2437 	}
2438 
2439 	mutex_exit(&qp->rdlist_lock);
2440 	return (ret);
2441 }
2442 
2443 static struct recv_wid *
2444 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2445 {
2446 	struct recv_wid	*rwid;
2447 
2448 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2449 	rwid->xid = msgid;
2450 	rwid->addr = sgl->ds_va;
2451 	rwid->qp = qp;
2452 
2453 	return (rwid);
2454 }
2455 
2456 static void
2457 rib_free_wid(struct recv_wid *rwid)
2458 {
2459 	kmem_free(rwid, sizeof (struct recv_wid));
2460 }
2461 
2462 rdma_stat
2463 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2464 {
2465 	rib_qp_t	*qp = ctoqp(conn);
2466 	struct clist	*clp = cl;
2467 	struct reply	*rep;
2468 	struct recv_wid	*rwid;
2469 	int		nds;
2470 	ibt_wr_ds_t	sgl[DSEG_MAX];
2471 	ibt_recv_wr_t	recv_wr;
2472 	rdma_stat	ret;
2473 	ibt_status_t	ibt_status;
2474 
2475 	/*
2476 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2477 	 */
2478 
2479 	nds = 0;
2480 	while (cl != NULL) {
2481 		if (nds >= DSEG_MAX) {
2482 			ret = RDMA_FAILED;
2483 			goto done;
2484 		}
2485 		sgl[nds].ds_va = cl->w.c_saddr;
2486 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2487 		sgl[nds].ds_len = cl->c_len;
2488 		cl = cl->c_next;
2489 		nds++;
2490 	}
2491 
2492 	if (nds != 1) {
2493 		ret = RDMA_FAILED;
2494 		goto done;
2495 	}
2496 
2497 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2498 	recv_wr.wr_nds = nds;
2499 	recv_wr.wr_sgl = sgl;
2500 
2501 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2502 	if (rwid) {
2503 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2504 	} else {
2505 		ret = RDMA_NORESOURCE;
2506 		goto done;
2507 	}
2508 	rep = rib_addreplylist(qp, msgid);
2509 	if (!rep) {
2510 		rib_free_wid(rwid);
2511 		ret = RDMA_NORESOURCE;
2512 		goto done;
2513 	}
2514 
2515 	mutex_enter(&conn->c_lock);
2516 
2517 	if (conn->c_state == C_CONNECTED) {
2518 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2519 	}
2520 
2521 	if (conn->c_state != C_CONNECTED ||
2522 	    ibt_status != IBT_SUCCESS) {
2523 		if (conn->c_state != C_DISCONN_PEND)
2524 			conn->c_state = C_ERROR_CONN;
2525 		mutex_exit(&conn->c_lock);
2526 		rib_free_wid(rwid);
2527 		(void) rib_rem_rep(qp, rep);
2528 		ret = RDMA_CONNLOST;
2529 		goto done;
2530 	}
2531 	mutex_exit(&conn->c_lock);
2532 	return (RDMA_SUCCESS);
2533 
2534 done:
2535 	while (clp != NULL) {
2536 		rib_rbuf_free(conn, RECV_BUFFER,
2537 		    (void *)(uintptr_t)clp->w.c_saddr3);
2538 		clp = clp->c_next;
2539 	}
2540 	return (ret);
2541 }
2542 
2543 rdma_stat
2544 rib_svc_post(CONN* conn, struct clist *cl)
2545 {
2546 	rib_qp_t	*qp = ctoqp(conn);
2547 	struct svc_recv	*s_recvp;
2548 	int		nds;
2549 	ibt_wr_ds_t	sgl[DSEG_MAX];
2550 	ibt_recv_wr_t	recv_wr;
2551 	ibt_status_t	ibt_status;
2552 
2553 	nds = 0;
2554 	while (cl != NULL) {
2555 		if (nds >= DSEG_MAX) {
2556 			return (RDMA_FAILED);
2557 		}
2558 		sgl[nds].ds_va = cl->w.c_saddr;
2559 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2560 		sgl[nds].ds_len = cl->c_len;
2561 		cl = cl->c_next;
2562 		nds++;
2563 	}
2564 
2565 	if (nds != 1) {
2566 		rib_rbuf_free(conn, RECV_BUFFER,
2567 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2568 
2569 		return (RDMA_FAILED);
2570 	}
2571 
2572 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2573 	recv_wr.wr_nds = nds;
2574 	recv_wr.wr_sgl = sgl;
2575 
2576 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2577 	/* Use s_recvp's addr as wr id */
2578 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2579 	mutex_enter(&conn->c_lock);
2580 	if (conn->c_state == C_CONNECTED) {
2581 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2582 	}
2583 	if (conn->c_state != C_CONNECTED ||
2584 	    ibt_status != IBT_SUCCESS) {
2585 		if (conn->c_state != C_DISCONN_PEND)
2586 			conn->c_state = C_ERROR_CONN;
2587 		mutex_exit(&conn->c_lock);
2588 		rib_rbuf_free(conn, RECV_BUFFER,
2589 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2590 		(void) rib_free_svc_recv(s_recvp);
2591 
2592 		return (RDMA_CONNLOST);
2593 	}
2594 	mutex_exit(&conn->c_lock);
2595 
2596 	return (RDMA_SUCCESS);
2597 }
2598 
2599 /* Client */
2600 rdma_stat
2601 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2602 {
2603 
2604 	return (rib_clnt_post(conn, cl, msgid));
2605 }
2606 
2607 /* Client */
2608 rdma_stat
2609 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2610 {
2611 	rib_qp_t	*qp = ctoqp(conn);
2612 	struct reply	*rep;
2613 
2614 	mutex_enter(&qp->replylist_lock);
2615 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2616 		if (rep->xid == msgid) {
2617 			if (rep->vaddr_cq) {
2618 				rib_rbuf_free(conn, RECV_BUFFER,
2619 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2620 			}
2621 			(void) rib_remreply(qp, rep);
2622 			break;
2623 		}
2624 	}
2625 	mutex_exit(&qp->replylist_lock);
2626 
2627 	return (RDMA_SUCCESS);
2628 }
2629 
2630 /* Server */
2631 rdma_stat
2632 rib_post_recv(CONN *conn, struct clist *cl)
2633 {
2634 	rib_qp_t	*qp = ctoqp(conn);
2635 
2636 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2637 		mutex_enter(&qp->posted_rbufs_lock);
2638 		qp->n_posted_rbufs++;
2639 		mutex_exit(&qp->posted_rbufs_lock);
2640 		return (RDMA_SUCCESS);
2641 	}
2642 	return (RDMA_FAILED);
2643 }
2644 
2645 /*
2646  * Client side only interface to "recv" the rpc reply buf
2647  * posted earlier by rib_post_resp(conn, cl, msgid).
2648  */
2649 rdma_stat
2650 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2651 {
2652 	struct reply *rep = NULL;
2653 	clock_t timout, cv_wait_ret;
2654 	rdma_stat ret = RDMA_SUCCESS;
2655 	rib_qp_t *qp = ctoqp(conn);
2656 
2657 	/*
2658 	 * Find the reply structure for this msgid
2659 	 */
2660 	mutex_enter(&qp->replylist_lock);
2661 
2662 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2663 		if (rep->xid == msgid)
2664 			break;
2665 	}
2666 
2667 	if (rep != NULL) {
2668 		/*
2669 		 * If message not yet received, wait.
2670 		 */
2671 		if (rep->status == (uint_t)REPLY_WAIT) {
2672 			timout = ddi_get_lbolt() +
2673 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2674 
2675 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2676 			    &qp->replylist_lock, timout)) > 0 &&
2677 			    rep->status == (uint_t)REPLY_WAIT)
2678 				;
2679 
2680 			switch (cv_wait_ret) {
2681 			case -1:	/* timeout */
2682 				ret = RDMA_TIMEDOUT;
2683 				break;
2684 			case 0:
2685 				ret = RDMA_INTR;
2686 				break;
2687 			default:
2688 				break;
2689 			}
2690 		}
2691 
2692 		if (rep->status == RDMA_SUCCESS) {
2693 			struct clist *cl = NULL;
2694 
2695 			/*
2696 			 * Got message successfully
2697 			 */
2698 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2699 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2700 			*clp = cl;
2701 		} else {
2702 			if (rep->status != (uint_t)REPLY_WAIT) {
2703 				/*
2704 				 * Got error in reply message. Free
2705 				 * recv buffer here.
2706 				 */
2707 				ret = rep->status;
2708 				rib_rbuf_free(conn, RECV_BUFFER,
2709 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2710 			}
2711 		}
2712 		(void) rib_remreply(qp, rep);
2713 	} else {
2714 		/*
2715 		 * No matching reply structure found for given msgid on the
2716 		 * reply wait list.
2717 		 */
2718 		ret = RDMA_INVAL;
2719 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2720 	}
2721 
2722 	/*
2723 	 * Done.
2724 	 */
2725 	mutex_exit(&qp->replylist_lock);
2726 	return (ret);
2727 }
2728 
2729 /*
2730  * RDMA write a buffer to the remote address.
2731  */
2732 rdma_stat
2733 rib_write(CONN *conn, struct clist *cl, int wait)
2734 {
2735 	ibt_send_wr_t	tx_wr;
2736 	int		cv_sig;
2737 	ibt_wr_ds_t	sgl[DSEG_MAX];
2738 	struct send_wid	*wdesc;
2739 	ibt_status_t	ibt_status;
2740 	rdma_stat	ret = RDMA_SUCCESS;
2741 	rib_qp_t	*qp = ctoqp(conn);
2742 	uint64_t	n_writes = 0;
2743 
2744 	if (cl == NULL) {
2745 		return (RDMA_FAILED);
2746 	}
2747 
2748 	while ((cl != NULL)) {
2749 		if (cl->c_len > 0) {
2750 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2751 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2752 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2753 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2754 			sgl[0].ds_va = cl->w.c_saddr;
2755 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2756 			sgl[0].ds_len = cl->c_len;
2757 
2758 			if (wait) {
2759 				cv_sig = 1;
2760 			} else {
2761 				if (n_writes > max_unsignaled_rws) {
2762 					n_writes = 0;
2763 					cv_sig = 1;
2764 				} else {
2765 					cv_sig = 0;
2766 				}
2767 			}
2768 
2769 			if (cv_sig) {
2770 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2771 				wdesc = rib_init_sendwait(0, cv_sig, qp);
2772 				tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2773 				mutex_enter(&wdesc->sendwait_lock);
2774 			} else {
2775 				tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2776 				tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2777 			}
2778 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2779 			tx_wr.wr_trans = IBT_RC_SRV;
2780 			tx_wr.wr_nds = 1;
2781 			tx_wr.wr_sgl = sgl;
2782 
2783 			mutex_enter(&conn->c_lock);
2784 			if (conn->c_state == C_CONNECTED) {
2785 				ibt_status =
2786 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2787 			}
2788 			if (conn->c_state != C_CONNECTED ||
2789 			    ibt_status != IBT_SUCCESS) {
2790 				if (conn->c_state != C_DISCONN_PEND)
2791 					conn->c_state = C_ERROR_CONN;
2792 				mutex_exit(&conn->c_lock);
2793 				if (cv_sig) {
2794 					mutex_exit(&wdesc->sendwait_lock);
2795 					(void) rib_free_sendwait(wdesc);
2796 				}
2797 				return (RDMA_CONNLOST);
2798 			}
2799 
2800 			mutex_exit(&conn->c_lock);
2801 
2802 			/*
2803 			 * Wait for send to complete
2804 			 */
2805 			if (cv_sig) {
2806 
2807 				rib_send_hold(qp);
2808 				mutex_exit(&wdesc->sendwait_lock);
2809 
2810 				ret = rib_sendwait(qp, wdesc);
2811 				if (ret != 0)
2812 					return (ret);
2813 			}
2814 			n_writes ++;
2815 		}
2816 		cl = cl->c_next;
2817 	}
2818 	return (RDMA_SUCCESS);
2819 }
2820 
2821 /*
2822  * RDMA Read a buffer from the remote address.
2823  */
2824 rdma_stat
2825 rib_read(CONN *conn, struct clist *cl, int wait)
2826 {
2827 	ibt_send_wr_t	rx_wr;
2828 	int		cv_sig = 0;
2829 	ibt_wr_ds_t	sgl;
2830 	struct send_wid	*wdesc;
2831 	ibt_status_t	ibt_status = IBT_SUCCESS;
2832 	rdma_stat	ret = RDMA_SUCCESS;
2833 	rib_qp_t	*qp = ctoqp(conn);
2834 
2835 	if (cl == NULL) {
2836 		return (RDMA_FAILED);
2837 	}
2838 
2839 	while (cl != NULL) {
2840 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2841 		/*
2842 		 * Remote address is at the head chunk item in list.
2843 		 */
2844 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2845 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2846 
2847 		sgl.ds_va = cl->u.c_daddr;
2848 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2849 		sgl.ds_len = cl->c_len;
2850 
2851 		/*
2852 		 * If there are multiple chunks to be read, and
2853 		 * wait is set, ask for signal only for the last chunk
2854 		 * and wait only on the last chunk. The completion of
2855 		 * RDMA_READ on last chunk ensures that reads on all
2856 		 * previous chunks are also completed.
2857 		 */
2858 		if (wait && (cl->c_next == NULL)) {
2859 			cv_sig = 1;
2860 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2861 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2862 			rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2863 			mutex_enter(&wdesc->sendwait_lock);
2864 		} else {
2865 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2866 			rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2867 		}
2868 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2869 		rx_wr.wr_trans = IBT_RC_SRV;
2870 		rx_wr.wr_nds = 1;
2871 		rx_wr.wr_sgl = &sgl;
2872 
2873 		mutex_enter(&conn->c_lock);
2874 		if (conn->c_state == C_CONNECTED) {
2875 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2876 		}
2877 		if (conn->c_state != C_CONNECTED ||
2878 		    ibt_status != IBT_SUCCESS) {
2879 			if (conn->c_state != C_DISCONN_PEND)
2880 				conn->c_state = C_ERROR_CONN;
2881 			mutex_exit(&conn->c_lock);
2882 			if (wait && (cl->c_next == NULL)) {
2883 				mutex_exit(&wdesc->sendwait_lock);
2884 				(void) rib_free_sendwait(wdesc);
2885 			}
2886 			return (RDMA_CONNLOST);
2887 		}
2888 
2889 		mutex_exit(&conn->c_lock);
2890 
2891 		/*
2892 		 * Wait for send to complete if this is the
2893 		 * last item in the list.
2894 		 */
2895 		if (wait && cl->c_next == NULL) {
2896 			rib_send_hold(qp);
2897 			mutex_exit(&wdesc->sendwait_lock);
2898 
2899 			ret = rib_sendwait(qp, wdesc);
2900 
2901 			if (ret != 0)
2902 				return (ret);
2903 		}
2904 		cl = cl->c_next;
2905 	}
2906 	return (RDMA_SUCCESS);
2907 }
2908 
/*
 * rib_srv_cm_handler()
 *    Connection Manager callback to handle RC connection requests.
 *
 *    `any' is cast to the rib_hca_t the event is handled for.
 *
 *    IBT_CM_EVENT_REQ_RCV
 *	Rejects the request while the plugin is in NO_ACCEPT state.
 *	Otherwise creates a server channel, fills in the REP arguments,
 *	pre-posts preposted_rbufs receive buffers, adds the connection
 *	to hca->srv_conn_list and records the peer's IP address (taken
 *	from the CM private data) in conn->c_raddr.
 *    IBT_CM_EVENT_CONN_CLOSED
 *	For closes initiated by the remote end, moves the connection to
 *	an error state and disconnects it right away if it has no
 *	references.
 *    IBT_CM_EVENT_CONN_EST
 *	Only logs (when rib_debug > 1).
 *    Anything else
 *	Optionally logged and left to the CM (IBT_CM_DEFAULT).
 *
 *    Returns IBT_CM_REJECT when a connection request cannot be
 *    honoured, IBT_CM_DEFAULT for events that are not handled here and
 *    IBT_CM_ACCEPT otherwise.
 */
/* ARGSUSED */
static ibt_cm_status_t
rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
	ibt_cm_return_args_t *ret_args, void *priv_data,
	ibt_priv_data_len_t len)
{
	queue_t		*q;
	rib_qp_t	*qp;
	rib_hca_t	*hca;
	rdma_stat	status = RDMA_SUCCESS;
	int		i;
	struct clist	cl;
	rdma_buf_t	rdbuf = {0};
	void		*buf = NULL;
	CONN		*conn;
	ibt_ip_cm_info_t	ipinfo;
	struct sockaddr_in *s;
	struct sockaddr_in6 *s6;
	int sin_size = sizeof (struct sockaddr_in);
	int in_size = sizeof (struct in_addr);
	int sin6_size = sizeof (struct sockaddr_in6);

	ASSERT(any != NULL);
	ASSERT(event != NULL);

	hca = (rib_hca_t *)any;

	/* got a connection request */
	switch (event->cm_type) {
	case IBT_CM_EVENT_REQ_RCV:
		/*
		 * If the plugin is in the NO_ACCEPT state, bail out.
		 */
		mutex_enter(&plugin_state_lock);
		if (plugin_state == NO_ACCEPT) {
			mutex_exit(&plugin_state_lock);
			return (IBT_CM_REJECT);
		}
		mutex_exit(&plugin_state_lock);

		/*
		 * Need to send a MRA MAD to CM so that it does not
		 * timeout on us.
		 */
		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
		    event->cm_event.req.req_timeout * 8, NULL, 0);

		/* Snapshot the queue pointer under open_hca_lock */
		mutex_enter(&rib_stat->open_hca_lock);
		q = rib_stat->q;
		mutex_exit(&rib_stat->open_hca_lock);

		status = rib_svc_create_chan(hca, (caddr_t)q,
		    event->cm_event.req.req_prim_hca_port, &qp);

		if (status) {
			return (IBT_CM_REJECT);
		}

		/* Arguments handed back to the CM for the REP */
		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;

		/*
		 * Pre-posts RECV buffers
		 */
		conn = qptoc(qp);
		for (i = 0; i < preposted_rbufs; i++) {
			bzero(&rdbuf, sizeof (rdbuf));
			rdbuf.type = RECV_BUFFER;
			buf = rib_rbuf_alloc(conn, &rdbuf);
			if (buf == NULL) {
				/*
				 * A connection is not established yet.
				 * Just flush the channel. Buffers
				 * posted till now will error out with
				 * IBT_WC_WR_FLUSHED_ERR.
				 */
				(void) ibt_flush_channel(qp->qp_hdl);
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}

			/* Describe the buffer as a one-element clist */
			bzero(&cl, sizeof (cl));
			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
			cl.c_len = rdbuf.len;
			cl.c_smemhandle.mrc_lmr =
			    rdbuf.handle.mrc_lmr; /* lkey */
			cl.c_next = NULL;
			status = rib_post_recv(conn, &cl);
			if (status != RDMA_SUCCESS) {
				/*
				 * A connection is not established yet.
				 * Just flush the channel. Buffers
				 * posted till now will error out with
				 * IBT_WC_WR_FLUSHED_ERR.
				 */
				(void) ibt_flush_channel(qp->qp_hdl);
				(void) rib_disconnect_channel(conn, NULL);
				return (IBT_CM_REJECT);
			}
		}
		(void) rib_add_connlist(conn, &hca->srv_conn_list);

		/*
		 * Get the address translation
		 *
		 * NOTE(review): the IBT_CM_REJECT returns from here on
		 * happen after the conn has been added to srv_conn_list
		 * and do not call rib_disconnect_channel() in this
		 * function; confirm the conn is cleaned up elsewhere.
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_exit(&hca->state_lock);
			return (IBT_CM_REJECT);
		}
		rw_exit(&hca->state_lock);

		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));

		if (ibt_get_ip_data(event->cm_priv_data_len,
		    event->cm_priv_data,
		    &ipinfo) != IBT_SUCCESS) {

			return (IBT_CM_REJECT);
		}

		/*
		 * Record the peer address in c_raddr.  Only the family
		 * and the address are filled in; the rest of the
		 * zero-allocated sockaddr is left as is.
		 */
		switch (ipinfo.src_addr.family) {
		case AF_INET:

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin_size;
			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);

			s = (struct sockaddr_in *)conn->c_raddr.buf;
			s->sin_family = AF_INET;

			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
			    &s->sin_addr, in_size);

			break;

		case AF_INET6:

			conn->c_raddr.maxlen =
			    conn->c_raddr.len = sin6_size;
			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);

			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
			s6->sin6_family = AF_INET6;
			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
			    &s6->sin6_addr,
			    sizeof (struct in6_addr));

			break;

		default:
			return (IBT_CM_REJECT);
		}

		break;

	case IBT_CM_EVENT_CONN_CLOSED:
	{
		/* These locals shadow the function-level conn and qp */
		CONN		*conn;
		rib_qp_t	*qp;

		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases where
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			/* A disconnect is already pending; leave it alone */
			if (conn->c_state == C_DISCONN_PEND) {
				mutex_exit(&conn->c_lock);
				break;
			}
			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the conn if c_ref goes down to 0
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				(void) rib_disconnect_channel(conn,
				    &hca->srv_conn_list);
			} else {
				/*
				 * conn will be freed when c_ref goes to 0.
				 * Indicate to cleaning thread not to close
				 * the connection, but just free the channel.
				 */
				conn->c_flags |= C_CLOSE_NOTNEEDED;
				mutex_exit(&conn->c_lock);
			}
			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
			break;
		}
		break;
	}
	case IBT_CM_EVENT_CONN_EST:
		/*
		 * RTU received, hence connection established.
		 */
		if (rib_debug > 1)
			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
			    "(CONN_EST) channel established");
		break;

	default:
		if (rib_debug > 2) {
			/* Let CM handle the following events. */
			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
			}
		}
		return (IBT_CM_DEFAULT);
	}

	/*
	 * Reached by the REQ_RCV (success), CONN_CLOSED and CONN_EST
	 * cases above, which all break out of the switch.
	 */
	return (IBT_CM_ACCEPT);
}
3165 
3166 static rdma_stat
3167 rib_register_service(rib_hca_t *hca, int service_type,
3168 	uint8_t protocol_num, in_port_t dst_port)
3169 {
3170 	ibt_srv_desc_t		sdesc;
3171 	ibt_hca_portinfo_t	*port_infop;
3172 	ib_svc_id_t		srv_id;
3173 	ibt_srv_hdl_t		srv_hdl;
3174 	uint_t			port_size;
3175 	uint_t			pki, i, num_ports, nbinds;
3176 	ibt_status_t		ibt_status;
3177 	rib_service_t		*service;
3178 	ib_pkey_t		pkey;
3179 
3180 	/*
3181 	 * Query all ports for the given HCA
3182 	 */
3183 	rw_enter(&hca->state_lock, RW_READER);
3184 	if (hca->state != HCA_DETACHED) {
3185 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3186 		    &num_ports, &port_size);
3187 		rw_exit(&hca->state_lock);
3188 	} else {
3189 		rw_exit(&hca->state_lock);
3190 		return (RDMA_FAILED);
3191 	}
3192 	if (ibt_status != IBT_SUCCESS) {
3193 		return (RDMA_FAILED);
3194 	}
3195 
3196 	DTRACE_PROBE1(rpcib__i__regservice_numports,
3197 	    int, num_ports);
3198 
3199 	for (i = 0; i < num_ports; i++) {
3200 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3201 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3202 			    int, i+1);
3203 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3204 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
3205 			    int, i+1);
3206 		}
3207 	}
3208 
3209 	/*
3210 	 * Get all the IP addresses on this system to register the
3211 	 * given "service type" on all DNS recognized IP addrs.
3212 	 * Each service type such as NFS will have all the systems
3213 	 * IP addresses as its different names. For now the only
3214 	 * type of service we support in RPCIB is NFS.
3215 	 */
3216 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3217 	/*
3218 	 * Start registering and binding service to active
3219 	 * on active ports on this HCA.
3220 	 */
3221 	nbinds = 0;
3222 	for (service = rib_stat->service_list;
3223 	    service && (service->srv_type != service_type);
3224 	    service = service->next)
3225 		;
3226 
3227 	if (service == NULL) {
3228 		/*
3229 		 * We use IP addresses as the service names for
3230 		 * service registration.  Register each of them
3231 		 * with CM to obtain a svc_id and svc_hdl.  We do not
3232 		 * register the service with machine's loopback address.
3233 		 */
3234 		(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3235 		(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3236 		(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3237 		sdesc.sd_handler = rib_srv_cm_handler;
3238 		sdesc.sd_flags = 0;
3239 		ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3240 		    &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3241 		    1, &srv_hdl, &srv_id);
3242 		if ((ibt_status != IBT_SUCCESS) &&
3243 		    (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3244 			rw_exit(&rib_stat->service_list_lock);
3245 			DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3246 			    int, ibt_status);
3247 			ibt_free_portinfo(port_infop, port_size);
3248 			return (RDMA_FAILED);
3249 		}
3250 
3251 		/*
3252 		 * Allocate and prepare a service entry
3253 		 */
3254 		service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3255 
3256 		service->srv_type = service_type;
3257 		service->srv_hdl = srv_hdl;
3258 		service->srv_id = srv_id;
3259 
3260 		service->next = rib_stat->service_list;
3261 		rib_stat->service_list = service;
3262 		DTRACE_PROBE1(rpcib__i__regservice__new__service,
3263 		    int, service->srv_type);
3264 	} else {
3265 		srv_hdl = service->srv_hdl;
3266 		srv_id = service->srv_id;
3267 		DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3268 		    int, service->srv_type);
3269 	}
3270 
3271 	for (i = 0; i < num_ports; i++) {
3272 		ibt_sbind_hdl_t		sbp;
3273 		rib_hca_service_t	*hca_srv;
3274 		ib_gid_t		gid;
3275 
3276 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3277 			continue;
3278 
3279 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3280 			pkey = port_infop[i].p_pkey_tbl[pki];
3281 
3282 			rw_enter(&hca->bound_services_lock, RW_READER);
3283 			gid = port_infop[i].p_sgid_tbl[0];
3284 			for (hca_srv = hca->bound_services; hca_srv;
3285 			    hca_srv = hca_srv->next) {
3286 				if ((hca_srv->srv_id == service->srv_id) &&
3287 				    (hca_srv->gid.gid_prefix ==
3288 				    gid.gid_prefix) &&
3289 				    (hca_srv->gid.gid_guid == gid.gid_guid))
3290 					break;
3291 			}
3292 			rw_exit(&hca->bound_services_lock);
3293 			if (hca_srv != NULL) {
3294 				/*
3295 				 * port is alreay bound the the service
3296 				 */
3297 				DTRACE_PROBE1(
3298 				    rpcib__i__regservice__already__bound,
3299 				    int, i+1);
3300 				nbinds++;
3301 				continue;
3302 			}
3303 
3304 			if ((pkey & IBSRM_HB) &&
3305 			    (pkey != IB_PKEY_INVALID_FULL)) {
3306 
3307 				sbp = NULL;
3308 				ibt_status = ibt_bind_service(srv_hdl,
3309 				    gid, NULL, hca, &sbp);
3310 
3311 				if (ibt_status == IBT_SUCCESS) {
3312 					hca_srv = kmem_zalloc(
3313 					    sizeof (rib_hca_service_t),
3314 					    KM_SLEEP);
3315 					hca_srv->srv_id = srv_id;
3316 					hca_srv->gid = gid;
3317 					hca_srv->sbind_hdl = sbp;
3318 
3319 					rw_enter(&hca->bound_services_lock,
3320 					    RW_WRITER);
3321 					hca_srv->next = hca->bound_services;
3322 					hca->bound_services = hca_srv;
3323 					rw_exit(&hca->bound_services_lock);
3324 					nbinds++;
3325 				}
3326 
3327 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3328 				    int, ibt_status);
3329 			}
3330 		}
3331 	}
3332 	rw_exit(&rib_stat->service_list_lock);
3333 
3334 	ibt_free_portinfo(port_infop, port_size);
3335 
3336 	if (nbinds == 0) {
3337 		return (RDMA_FAILED);
3338 	} else {
3339 		/*
3340 		 * Put this plugin into accept state, since atleast
3341 		 * one registration was successful.
3342 		 */
3343 		mutex_enter(&plugin_state_lock);
3344 		plugin_state = ACCEPT;
3345 		mutex_exit(&plugin_state_lock);
3346 		return (RDMA_SUCCESS);
3347 	}
3348 }
3349 
3350 void
3351 rib_listen(struct rdma_svc_data *rd)
3352 {
3353 	rdma_stat status;
3354 	int n_listening = 0;
3355 	rib_hca_t *hca;
3356 
3357 	mutex_enter(&rib_stat->listen_lock);
3358 	/*
3359 	 * if rd parameter is NULL then it means that rib_stat->q is
3360 	 * already initialized by a call from RDMA and we just want to
3361 	 * add a newly attached HCA to the same listening state as other
3362 	 * HCAs.
3363 	 */
3364 	if (rd == NULL) {
3365 		if (rib_stat->q == NULL) {
3366 			mutex_exit(&rib_stat->listen_lock);
3367 			return;
3368 		}
3369 	} else {
3370 		rib_stat->q = &rd->q;
3371 	}
3372 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3373 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3374 		/*
3375 		 * First check if a hca is still attached
3376 		 */
3377 		rw_enter(&hca->state_lock, RW_READER);
3378 		if (hca->state != HCA_INITED) {
3379 			rw_exit(&hca->state_lock);
3380 			continue;
3381 		}
3382 		rw_exit(&hca->state_lock);
3383 
3384 		/*
3385 		 * Right now the only service type is NFS. Hence
3386 		 * force feed this value. Ideally to communicate
3387 		 * the service type it should be passed down in
3388 		 * rdma_svc_data.
3389 		 */
3390 		status = rib_register_service(hca, NFS,
3391 		    IPPROTO_TCP, nfs_rdma_port);
3392 		if (status == RDMA_SUCCESS)
3393 			n_listening++;
3394 	}
3395 	rw_exit(&rib_stat->hcas_list_lock);
3396 
3397 	/*
3398 	 * Service active on an HCA, check rd->err_code for more
3399 	 * explainable errors.
3400 	 */
3401 	if (rd) {
3402 		if (n_listening > 0) {
3403 			rd->active = 1;
3404 			rd->err_code = RDMA_SUCCESS;
3405 		} else {
3406 			rd->active = 0;
3407 			rd->err_code = RDMA_FAILED;
3408 		}
3409 	}
3410 	mutex_exit(&rib_stat->listen_lock);
3411 }
3412 
3413 /* XXXX */
3414 /* ARGSUSED */
3415 static void
3416 rib_listen_stop(struct rdma_svc_data *svcdata)
3417 {
3418 	rib_hca_t		*hca;
3419 
3420 	mutex_enter(&rib_stat->listen_lock);
3421 	/*
3422 	 * KRPC called the RDMATF to stop the listeners, this means
3423 	 * stop sending incomming or recieved requests to KRPC master
3424 	 * transport handle for RDMA-IB. This is also means that the
3425 	 * master transport handle, responsible for us, is going away.
3426 	 */
3427 	mutex_enter(&plugin_state_lock);
3428 	plugin_state = NO_ACCEPT;
3429 	if (svcdata != NULL)
3430 		svcdata->active = 0;
3431 	mutex_exit(&plugin_state_lock);
3432 
3433 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3434 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3435 		/*
3436 		 * First check if a hca is still attached
3437 		 */
3438 		rw_enter(&hca->state_lock, RW_READER);
3439 		if (hca->state == HCA_DETACHED) {
3440 			rw_exit(&hca->state_lock);
3441 			continue;
3442 		}
3443 		rib_close_channels(&hca->srv_conn_list);
3444 		rib_stop_services(hca);
3445 		rw_exit(&hca->state_lock);
3446 	}
3447 	rw_exit(&rib_stat->hcas_list_lock);
3448 
3449 	/*
3450 	 * Avoid rib_listen() using the stale q field.
3451 	 * This could happen if a port goes up after all services
3452 	 * are already unregistered.
3453 	 */
3454 	rib_stat->q = NULL;
3455 	mutex_exit(&rib_stat->listen_lock);
3456 }
3457 
3458 /*
3459  * Traverse the HCA's service list to unbind and deregister services.
3460  * For each bound service of HCA to be removed, first find the corresponding
3461  * service handle (srv_hdl) and then unbind the service by calling
3462  * ibt_unbind_service().
3463  */
3464 static void
3465 rib_stop_services(rib_hca_t *hca)
3466 {
3467 	rib_hca_service_t *srv_list, *to_remove;
3468 
3469 	/*
3470 	 * unbind and deregister the services for this service type.
3471 	 * Right now there is only one service type. In future it will
3472 	 * be passed down to this function.
3473 	 */
3474 	rw_enter(&hca->bound_services_lock, RW_READER);
3475 	srv_list = hca->bound_services;
3476 	hca->bound_services = NULL;
3477 	rw_exit(&hca->bound_services_lock);
3478 
3479 	while (srv_list != NULL) {
3480 		rib_service_t *sc;
3481 
3482 		to_remove = srv_list;
3483 		srv_list = to_remove->next;
3484 		rw_enter(&rib_stat->service_list_lock, RW_READER);
3485 		for (sc = rib_stat->service_list;
3486 		    sc && (sc->srv_id != to_remove->srv_id);
3487 		    sc = sc->next)
3488 			;
3489 		/*
3490 		 * if sc is NULL then the service doesn't exist anymore,
3491 		 * probably just removed completely through rib_stat.
3492 		 */
3493 		if (sc != NULL)
3494 			(void) ibt_unbind_service(sc->srv_hdl,
3495 			    to_remove->sbind_hdl);
3496 		rw_exit(&rib_stat->service_list_lock);
3497 		kmem_free(to_remove, sizeof (rib_hca_service_t));
3498 	}
3499 }
3500 
3501 static struct svc_recv *
3502 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3503 {
3504 	struct svc_recv	*recvp;
3505 
3506 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3507 	recvp->vaddr = sgl->ds_va;
3508 	recvp->qp = qp;
3509 	recvp->bytes_xfer = 0;
3510 	return (recvp);
3511 }
3512 
3513 static int
3514 rib_free_svc_recv(struct svc_recv *recvp)
3515 {
3516 	kmem_free(recvp, sizeof (*recvp));
3517 
3518 	return (0);
3519 }
3520 
3521 static struct reply *
3522 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3523 {
3524 	struct reply	*rep;
3525 
3526 
3527 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3528 	if (rep == NULL) {
3529 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3530 		return (NULL);
3531 	}
3532 	rep->xid = msgid;
3533 	rep->vaddr_cq = NULL;
3534 	rep->bytes_xfer = 0;
3535 	rep->status = (uint_t)REPLY_WAIT;
3536 	rep->prev = NULL;
3537 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3538 
3539 	mutex_enter(&qp->replylist_lock);
3540 	if (qp->replylist) {
3541 		rep->next = qp->replylist;
3542 		qp->replylist->prev = rep;
3543 	}
3544 	qp->rep_list_size++;
3545 
3546 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3547 	    int, qp->rep_list_size);
3548 
3549 	qp->replylist = rep;
3550 	mutex_exit(&qp->replylist_lock);
3551 
3552 	return (rep);
3553 }
3554 
3555 static rdma_stat
3556 rib_rem_replylist(rib_qp_t *qp)
3557 {
3558 	struct reply	*r, *n;
3559 
3560 	mutex_enter(&qp->replylist_lock);
3561 	for (r = qp->replylist; r != NULL; r = n) {
3562 		n = r->next;
3563 		(void) rib_remreply(qp, r);
3564 	}
3565 	mutex_exit(&qp->replylist_lock);
3566 
3567 	return (RDMA_SUCCESS);
3568 }
3569 
3570 static int
3571 rib_remreply(rib_qp_t *qp, struct reply *rep)
3572 {
3573 
3574 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3575 	if (rep->prev) {
3576 		rep->prev->next = rep->next;
3577 	}
3578 	if (rep->next) {
3579 		rep->next->prev = rep->prev;
3580 	}
3581 	if (qp->replylist == rep)
3582 		qp->replylist = rep->next;
3583 
3584 	cv_destroy(&rep->wait_cv);
3585 	qp->rep_list_size--;
3586 
3587 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3588 	    int, qp->rep_list_size);
3589 
3590 	kmem_free(rep, sizeof (*rep));
3591 
3592 	return (0);
3593 }
3594 
3595 rdma_stat
3596 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3597 	struct mrc *buf_handle)
3598 {
3599 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3600 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3601 	rdma_stat	status;
3602 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3603 
3604 	/*
3605 	 * Note: ALL buffer pools use the same memory type RDMARW.
3606 	 */
3607 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3608 	if (status == RDMA_SUCCESS) {
3609 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3610 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3611 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3612 	} else {
3613 		buf_handle->mrc_linfo = NULL;
3614 		buf_handle->mrc_lmr = 0;
3615 		buf_handle->mrc_rmr = 0;
3616 	}
3617 	return (status);
3618 }
3619 
3620 static rdma_stat
3621 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3622 	ibt_mr_flags_t spec,
3623 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3624 {
3625 	ibt_mr_attr_t	mem_attr;
3626 	ibt_status_t	ibt_status;
3627 	mem_attr.mr_vaddr = (uintptr_t)buf;
3628 	mem_attr.mr_len = (ib_msglen_t)size;
3629 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3630 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3631 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3632 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3633 
3634 	rw_enter(&hca->state_lock, RW_READER);
3635 	if (hca->state != HCA_DETACHED) {
3636 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3637 		    &mem_attr, mr_hdlp, mr_descp);
3638 		rw_exit(&hca->state_lock);
3639 	} else {
3640 		rw_exit(&hca->state_lock);
3641 		return (RDMA_FAILED);
3642 	}
3643 
3644 	if (ibt_status != IBT_SUCCESS) {
3645 		return (RDMA_FAILED);
3646 	}
3647 	return (RDMA_SUCCESS);
3648 }
3649 
3650 rdma_stat
3651 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3652 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3653 {
3654 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3655 	rib_lrc_entry_t *l;
3656 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3657 	rdma_stat	status;
3658 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3659 
3660 	/*
3661 	 * Non-coherent memory registration.
3662 	 */
3663 	l = (rib_lrc_entry_t *)lrc;
3664 	if (l) {
3665 		if (l->registered) {
3666 			buf_handle->mrc_linfo =
3667 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3668 			buf_handle->mrc_lmr =
3669 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3670 			buf_handle->mrc_rmr =
3671 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3672 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3673 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3674 			return (RDMA_SUCCESS);
3675 		} else {
3676 			/* Always register the whole buffer */
3677 			buf = (caddr_t)l->lrc_buf;
3678 			buflen = l->lrc_len;
3679 		}
3680 	}
3681 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3682 
3683 	if (status == RDMA_SUCCESS) {
3684 		if (l) {
3685 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3686 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3687 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3688 			l->registered		 = TRUE;
3689 		}
3690 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3691 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3692 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3693 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3694 	} else {
3695 		buf_handle->mrc_linfo = NULL;
3696 		buf_handle->mrc_lmr = 0;
3697 		buf_handle->mrc_rmr = 0;
3698 	}
3699 	return (status);
3700 }
3701 
3702 /* ARGSUSED */
3703 rdma_stat
3704 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3705 {
3706 	rib_hca_t *hca = (ctoqp(conn))->hca;
3707 	/*
3708 	 * Allow memory deregistration even if HCA is
3709 	 * getting detached. Need all outstanding
3710 	 * memory registrations to be deregistered
3711 	 * before HCA_DETACH_EVENT can be accepted.
3712 	 */
3713 	(void) ibt_deregister_mr(hca->hca_hdl,
3714 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3715 	return (RDMA_SUCCESS);
3716 }
3717 
3718 /* ARGSUSED */
3719 rdma_stat
3720 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3721 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3722 {
3723 	rib_lrc_entry_t *l;
3724 	l = (rib_lrc_entry_t *)lrc;
3725 	if (l)
3726 		if (l->registered)
3727 			return (RDMA_SUCCESS);
3728 
3729 	(void) rib_deregistermem(conn, buf, buf_handle);
3730 
3731 	return (RDMA_SUCCESS);
3732 }
3733 
3734 /* ARGSUSED */
3735 rdma_stat
3736 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3737 		int len, int cpu)
3738 {
3739 	ibt_status_t	status;
3740 	rib_hca_t *hca = (ctoqp(conn))->hca;
3741 	ibt_mr_sync_t	mr_segment;
3742 
3743 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3744 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3745 	mr_segment.ms_len = (ib_memlen_t)len;
3746 	if (cpu) {
3747 		/* make incoming data visible to memory */
3748 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3749 	} else {
3750 		/* make memory changes visible to IO */
3751 		mr_segment.ms_flags = IBT_SYNC_READ;
3752 	}
3753 	rw_enter(&hca->state_lock, RW_READER);
3754 	if (hca->state != HCA_DETACHED) {
3755 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3756 		rw_exit(&hca->state_lock);
3757 	} else {
3758 		rw_exit(&hca->state_lock);
3759 		return (RDMA_FAILED);
3760 	}
3761 
3762 	if (status == IBT_SUCCESS)
3763 		return (RDMA_SUCCESS);
3764 	else {
3765 		return (RDMA_FAILED);
3766 	}
3767 }
3768 
3769 /*
3770  * XXXX	????
3771  */
3772 static rdma_stat
3773 rib_getinfo(rdma_info_t *info)
3774 {
3775 	/*
3776 	 * XXXX	Hack!
3777 	 */
3778 	info->addrlen = 16;
3779 	info->mts = 1000000;
3780 	info->mtu = 1000000;
3781 
3782 	return (RDMA_SUCCESS);
3783 }
3784 
3785 rib_bufpool_t *
3786 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3787 {
3788 	rib_bufpool_t	*rbp = NULL;
3789 	bufpool_t	*bp = NULL;
3790 	caddr_t		buf;
3791 	ibt_mr_attr_t	mem_attr;
3792 	ibt_status_t	ibt_status;
3793 	int		i, j;
3794 
3795 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3796 
3797 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3798 	    num * sizeof (void *), KM_SLEEP);
3799 
3800 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3801 	bp->numelems = num;
3802 
3803 
3804 	switch (ptype) {
3805 	case SEND_BUFFER:
3806 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3807 		bp->rsize = RPC_MSG_SZ;
3808 		break;
3809 	case RECV_BUFFER:
3810 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3811 		bp->rsize = RPC_BUF_SIZE;
3812 		break;
3813 	default:
3814 		goto fail;
3815 	}
3816 
3817 	/*
3818 	 * Register the pool.
3819 	 */
3820 	bp->bufsize = num * bp->rsize;
3821 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3822 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3823 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3824 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3825 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3826 	rw_enter(&hca->state_lock, RW_READER);
3827 
3828 	if (hca->state == HCA_DETACHED) {
3829 		rw_exit(&hca->state_lock);
3830 		goto fail;
3831 	}
3832 
3833 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3834 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3835 		mem_attr.mr_vaddr = (uintptr_t)buf;
3836 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3837 		mem_attr.mr_as = NULL;
3838 		ibt_status = ibt_register_mr(hca->hca_hdl,
3839 		    hca->pd_hdl, &mem_attr,
3840 		    &rbp->mr_hdl[i],
3841 		    &rbp->mr_desc[i]);
3842 		if (ibt_status != IBT_SUCCESS) {
3843 			for (j = 0; j < i; j++) {
3844 				(void) ibt_deregister_mr(hca->hca_hdl,
3845 				    rbp->mr_hdl[j]);
3846 			}
3847 			rw_exit(&hca->state_lock);
3848 			goto fail;
3849 		}
3850 	}
3851 	rw_exit(&hca->state_lock);
3852 	buf = (caddr_t)bp->buf;
3853 	for (i = 0; i < num; i++, buf += bp->rsize) {
3854 		bp->buflist[i] = (void *)buf;
3855 	}
3856 	bp->buffree = num - 1;	/* no. of free buffers */
3857 	rbp->bpool = bp;
3858 
3859 	return (rbp);
3860 fail:
3861 	if (bp) {
3862 		if (bp->buf)
3863 			kmem_free(bp->buf, bp->bufsize);
3864 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3865 	}
3866 	if (rbp) {
3867 		if (rbp->mr_hdl)
3868 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3869 		if (rbp->mr_desc)
3870 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3871 		kmem_free(rbp, sizeof (rib_bufpool_t));
3872 	}
3873 	return (NULL);
3874 }
3875 
3876 static void
3877 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3878 {
3879 	int i;
3880 	rib_bufpool_t *rbp = NULL;
3881 	bufpool_t *bp;
3882 
3883 	/*
3884 	 * Obtain pool address based on type of pool
3885 	 */
3886 	switch (ptype) {
3887 		case SEND_BUFFER:
3888 			rbp = hca->send_pool;
3889 			break;
3890 		case RECV_BUFFER:
3891 			rbp = hca->recv_pool;
3892 			break;
3893 		default:
3894 			return;
3895 	}
3896 	if (rbp == NULL)
3897 		return;
3898 
3899 	bp = rbp->bpool;
3900 
3901 	/*
3902 	 * Deregister the pool memory and free it.
3903 	 */
3904 	for (i = 0; i < bp->numelems; i++) {
3905 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3906 	}
3907 }
3908 
3909 static void
3910 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3911 {
3912 
3913 	rib_bufpool_t *rbp = NULL;
3914 	bufpool_t *bp;
3915 
3916 	/*
3917 	 * Obtain pool address based on type of pool
3918 	 */
3919 	switch (ptype) {
3920 		case SEND_BUFFER:
3921 			rbp = hca->send_pool;
3922 			break;
3923 		case RECV_BUFFER:
3924 			rbp = hca->recv_pool;
3925 			break;
3926 		default:
3927 			return;
3928 	}
3929 	if (rbp == NULL)
3930 		return;
3931 
3932 	bp = rbp->bpool;
3933 
3934 	/*
3935 	 * Free the pool memory.
3936 	 */
3937 	if (rbp->mr_hdl)
3938 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3939 
3940 	if (rbp->mr_desc)
3941 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3942 	if (bp->buf)
3943 		kmem_free(bp->buf, bp->bufsize);
3944 	mutex_destroy(&bp->buflock);
3945 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3946 	kmem_free(rbp, sizeof (rib_bufpool_t));
3947 }
3948 
3949 void
3950 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3951 {
3952 	/*
3953 	 * Deregister the pool memory and free it.
3954 	 */
3955 	rib_rbufpool_deregister(hca, ptype);
3956 	rib_rbufpool_free(hca, ptype);
3957 }
3958 
3959 /*
3960  * Fetch a buffer from the pool of type specified in rdbuf->type.
3961  */
3962 static rdma_stat
3963 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3964 {
3965 	rib_lrc_entry_t *rlep;
3966 
3967 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
3968 		rlep = rib_get_cache_buf(conn, rdbuf->len);
3969 		rdbuf->rb_private =  (caddr_t)rlep;
3970 		rdbuf->addr = rlep->lrc_buf;
3971 		rdbuf->handle = rlep->lrc_mhandle;
3972 		return (RDMA_SUCCESS);
3973 	}
3974 
3975 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
3976 	if (rdbuf->addr) {
3977 		switch (rdbuf->type) {
3978 		case SEND_BUFFER:
3979 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
3980 			break;
3981 		case RECV_BUFFER:
3982 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
3983 			break;
3984 		default:
3985 			rdbuf->len = 0;
3986 		}
3987 		return (RDMA_SUCCESS);
3988 	} else
3989 		return (RDMA_FAILED);
3990 }
3991 
3992 /*
3993  * Fetch a buffer of specified type.
3994  * Note that rdbuf->handle is mw's rkey.
3995  */
3996 static void *
3997 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
3998 {
3999 	rib_qp_t	*qp = ctoqp(conn);
4000 	rib_hca_t	*hca = qp->hca;
4001 	rdma_btype	ptype = rdbuf->type;
4002 	void		*buf;
4003 	rib_bufpool_t	*rbp = NULL;
4004 	bufpool_t	*bp;
4005 	int		i;
4006 
4007 	/*
4008 	 * Obtain pool address based on type of pool
4009 	 */
4010 	switch (ptype) {
4011 	case SEND_BUFFER:
4012 		rbp = hca->send_pool;
4013 		break;
4014 	case RECV_BUFFER:
4015 		rbp = hca->recv_pool;
4016 		break;
4017 	default:
4018 		return (NULL);
4019 	}
4020 	if (rbp == NULL)
4021 		return (NULL);
4022 
4023 	bp = rbp->bpool;
4024 
4025 	mutex_enter(&bp->buflock);
4026 	if (bp->buffree < 0) {
4027 		mutex_exit(&bp->buflock);
4028 		return (NULL);
4029 	}
4030 
4031 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4032 	buf = bp->buflist[bp->buffree];
4033 	rdbuf->addr = buf;
4034 	rdbuf->len = bp->rsize;
4035 	for (i = bp->numelems - 1; i >= 0; i--) {
4036 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4037 			rdbuf->handle.mrc_rmr =
4038 			    (uint32_t)rbp->mr_desc[i].md_rkey;
4039 			rdbuf->handle.mrc_linfo =
4040 			    (uintptr_t)rbp->mr_hdl[i];
4041 			rdbuf->handle.mrc_lmr =
4042 			    (uint32_t)rbp->mr_desc[i].md_lkey;
4043 			bp->buffree--;
4044 
4045 			mutex_exit(&bp->buflock);
4046 
4047 			return (buf);
4048 		}
4049 	}
4050 
4051 	mutex_exit(&bp->buflock);
4052 
4053 	return (NULL);
4054 }
4055 
4056 static void
4057 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4058 {
4059 
4060 	if (rdbuf->type == RDMA_LONG_BUFFER) {
4061 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4062 		rdbuf->rb_private = NULL;
4063 		return;
4064 	}
4065 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4066 }
4067 
4068 static void
4069 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4070 {
4071 	rib_qp_t *qp = ctoqp(conn);
4072 	rib_hca_t *hca = qp->hca;
4073 	rib_bufpool_t *rbp = NULL;
4074 	bufpool_t *bp;
4075 
4076 	/*
4077 	 * Obtain pool address based on type of pool
4078 	 */
4079 	switch (ptype) {
4080 	case SEND_BUFFER:
4081 		rbp = hca->send_pool;
4082 		break;
4083 	case RECV_BUFFER:
4084 		rbp = hca->recv_pool;
4085 		break;
4086 	default:
4087 		return;
4088 	}
4089 	if (rbp == NULL)
4090 		return;
4091 
4092 	bp = rbp->bpool;
4093 
4094 	mutex_enter(&bp->buflock);
4095 	if (++bp->buffree >= bp->numelems) {
4096 		/*
4097 		 * Should never happen
4098 		 */
4099 		bp->buffree--;
4100 	} else {
4101 		bp->buflist[bp->buffree] = buf;
4102 	}
4103 	mutex_exit(&bp->buflock);
4104 }
4105 
4106 static rdma_stat
4107 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4108 {
4109 	rw_enter(&connlist->conn_lock, RW_WRITER);
4110 	if (connlist->conn_hd) {
4111 		cn->c_next = connlist->conn_hd;
4112 		connlist->conn_hd->c_prev = cn;
4113 	}
4114 	connlist->conn_hd = cn;
4115 	rw_exit(&connlist->conn_lock);
4116 
4117 	return (RDMA_SUCCESS);
4118 }
4119 
4120 static rdma_stat
4121 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4122 {
4123 	rw_enter(&connlist->conn_lock, RW_WRITER);
4124 	if (cn->c_prev) {
4125 		cn->c_prev->c_next = cn->c_next;
4126 	}
4127 	if (cn->c_next) {
4128 		cn->c_next->c_prev = cn->c_prev;
4129 	}
4130 	if (connlist->conn_hd == cn)
4131 		connlist->conn_hd = cn->c_next;
4132 	rw_exit(&connlist->conn_lock);
4133 
4134 	return (RDMA_SUCCESS);
4135 }
4136 
4137 /*
4138  * rib_find_hca_connection
4139  *
4140  * if there is an existing connection to the specified address then
4141  * it will be returned in conn, otherwise conn will be set to NULL.
4142  * Also cleans up any connection that is in error state.
4143  */
4144 static int
4145 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
4146     struct netbuf *d_svcaddr, CONN **conn)
4147 {
4148 	CONN *cn;
4149 	clock_t cv_stat, timout;
4150 
4151 	*conn = NULL;
4152 again:
4153 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4154 	cn = hca->cl_conn_list.conn_hd;
4155 	while (cn != NULL) {
4156 		/*
4157 		 * First, clear up any connection in the ERROR state
4158 		 */
4159 		mutex_enter(&cn->c_lock);
4160 		if (cn->c_state == C_ERROR_CONN) {
4161 			if (cn->c_ref == 0) {
4162 				/*
4163 				 * Remove connection from list and destroy it.
4164 				 */
4165 				cn->c_state = C_DISCONN_PEND;
4166 				mutex_exit(&cn->c_lock);
4167 				rw_exit(&hca->cl_conn_list.conn_lock);
4168 				rib_conn_close((void *)cn);
4169 				goto again;
4170 			}
4171 			mutex_exit(&cn->c_lock);
4172 			cn = cn->c_next;
4173 			continue;
4174 		}
4175 		if (cn->c_state == C_DISCONN_PEND) {
4176 			mutex_exit(&cn->c_lock);
4177 			cn = cn->c_next;
4178 			continue;
4179 		}
4180 
4181 		/*
4182 		 * source address is only checked for if there is one,
4183 		 * this is the case for retries.
4184 		 */
4185 		if ((cn->c_raddr.len == d_svcaddr->len) &&
4186 		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
4187 		    d_svcaddr->len) == 0) &&
4188 		    ((s_svcaddr->len == 0) ||
4189 		    ((cn->c_laddr.len == s_svcaddr->len) &&
4190 		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
4191 		    s_svcaddr->len) == 0)))) {
4192 			/*
4193 			 * Our connection. Give up conn list lock
4194 			 * as we are done traversing the list.
4195 			 */
4196 			rw_exit(&hca->cl_conn_list.conn_lock);
4197 			if (cn->c_state == C_CONNECTED) {
4198 				cn->c_ref++;	/* sharing a conn */
4199 				mutex_exit(&cn->c_lock);
4200 				*conn = cn;
4201 				return (RDMA_SUCCESS);
4202 			}
4203 			if (cn->c_state == C_CONN_PEND) {
4204 				/*
4205 				 * Hold a reference to this conn before
4206 				 * we give up the lock.
4207 				 */
4208 				cn->c_ref++;
4209 				timout =  ddi_get_lbolt() +
4210 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
4211 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4212 				    &cn->c_lock, timout)) > 0 &&
4213 				    cn->c_state == C_CONN_PEND)
4214 					;
4215 				if (cv_stat == 0) {
4216 					cn->c_ref--;
4217 					mutex_exit(&cn->c_lock);
4218 					return (RDMA_INTR);
4219 				}
4220 				if (cv_stat < 0) {
4221 					cn->c_ref--;
4222 					mutex_exit(&cn->c_lock);
4223 					return (RDMA_TIMEDOUT);
4224 				}
4225 				if (cn->c_state == C_CONNECTED) {
4226 					*conn = cn;
4227 					mutex_exit(&cn->c_lock);
4228 					return (RDMA_SUCCESS);
4229 				} else {
4230 					cn->c_ref--;
4231 					mutex_exit(&cn->c_lock);
4232 					return (RDMA_TIMEDOUT);
4233 				}
4234 			}
4235 		}
4236 		mutex_exit(&cn->c_lock);
4237 		cn = cn->c_next;
4238 	}
4239 	rw_exit(&hca->cl_conn_list.conn_lock);
4240 	*conn = NULL;
4241 	return (RDMA_FAILED);
4242 }
4243 
4244 /*
4245  * Connection management.
4246  * IBTF does not support recycling of channels. So connections are only
4247  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
4248  * C_DISCONN_PEND state. No C_IDLE state.
4249  * C_CONN_PEND state: Connection establishment in progress to the server.
4250  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4251  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4252  * only in this state.
4253  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4254  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4255  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4256  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4257  * c_ref drops to 0 (this indicates that RPC has no more references to this
4258  * connection), the connection should be destroyed. A connection transitions
4259  * into this state when it is being destroyed.
4260  */
4261 /* ARGSUSED */
4262 static rdma_stat
4263 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4264     int addr_type, void *handle, CONN **conn)
4265 {
4266 	CONN *cn;
4267 	int status;
4268 	rib_hca_t *hca;
4269 	rib_qp_t *qp;
4270 	rpcib_ping_t rpt;
4271 	int s_addr_len;
4272 	char *s_addr_buf;
4273 
4274 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4275 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4276 		rw_enter(&hca->state_lock, RW_READER);
4277 		if (hca->state != HCA_DETACHED) {
4278 			status = rib_find_hca_connection(hca, s_svcaddr,
4279 			    d_svcaddr, conn);
4280 			rw_exit(&hca->state_lock);
4281 			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4282 				rw_exit(&rib_stat->hcas_list_lock);
4283 				return (status);
4284 			}
4285 		} else
4286 			rw_exit(&hca->state_lock);
4287 	}
4288 	rw_exit(&rib_stat->hcas_list_lock);
4289 
4290 	/*
4291 	 * No existing connection found, establish a new connection.
4292 	 */
4293 	bzero(&rpt, sizeof (rpcib_ping_t));
4294 
4295 	status = rib_ping_srv(addr_type, d_svcaddr, &rpt);
4296 	if (status != RDMA_SUCCESS) {
4297 		return (RDMA_FAILED);
4298 	}
4299 	hca = rpt.hca;
4300 
4301 	if (rpt.srcip.family == AF_INET) {
4302 		s_addr_len = sizeof (rpt.srcip.un.ip4addr);
4303 		s_addr_buf = (char *)&rpt.srcip.un.ip4addr;
4304 	} else if (rpt.srcip.family == AF_INET6) {
4305 		s_addr_len = sizeof (rpt.srcip.un.ip6addr);
4306 		s_addr_buf = (char *)&rpt.srcip.un.ip6addr;
4307 	} else
4308 		return (RDMA_FAILED);
4309 
4310 	/*
4311 	 * Channel to server doesn't exist yet, create one.
4312 	 */
4313 	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4314 		return (RDMA_FAILED);
4315 	}
4316 	cn = qptoc(qp);
4317 	cn->c_state = C_CONN_PEND;
4318 	cn->c_ref = 1;
4319 
4320 	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4321 	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4322 	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4323 
4324 	/*
4325 	 * Add to conn list.
4326 	 * We had given up the READER lock. In the time since then,
4327 	 * another thread might have created the connection we are
4328 	 * trying here. But for now, that is quiet alright - there
4329 	 * might be two connections between a pair of hosts instead
4330 	 * of one. If we really want to close that window,
4331 	 * then need to check the list after acquiring the
4332 	 * WRITER lock.
4333 	 */
4334 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4335 	status = rib_conn_to_srv(hca, qp, &rpt);
4336 	mutex_enter(&cn->c_lock);
4337 	if (status == RDMA_SUCCESS) {
4338 		cn->c_state = C_CONNECTED;
4339 		*conn = cn;
4340 	} else {
4341 		cn->c_state = C_ERROR_CONN;
4342 		cn->c_ref--;
4343 	}
4344 	cv_broadcast(&cn->c_cv);
4345 	mutex_exit(&cn->c_lock);
4346 	return (status);
4347 }
4348 
4349 static void
4350 rib_conn_close(void *rarg)
4351 {
4352 	CONN *conn = (CONN *)rarg;
4353 	rib_qp_t *qp = ctoqp(conn);
4354 
4355 	mutex_enter(&conn->c_lock);
4356 	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4357 
4358 		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4359 		/*
4360 		 * Live connection in CONNECTED state.
4361 		 */
4362 		if (conn->c_state == C_CONNECTED) {
4363 			conn->c_state = C_ERROR_CONN;
4364 		}
4365 		mutex_exit(&conn->c_lock);
4366 
4367 		rib_close_a_channel(conn);
4368 
4369 		mutex_enter(&conn->c_lock);
4370 		conn->c_flags &= ~C_CLOSE_PENDING;
4371 		cv_signal(&conn->c_cv);
4372 	}
4373 
4374 	mutex_exit(&conn->c_lock);
4375 
4376 	if (qp->mode == RIB_SERVER)
4377 		(void) rib_disconnect_channel(conn,
4378 		    &qp->hca->srv_conn_list);
4379 	else
4380 		(void) rib_disconnect_channel(conn,
4381 		    &qp->hca->cl_conn_list);
4382 }
4383 
4384 static void
4385 rib_conn_timeout_call(void *carg)
4386 {
4387 	time_t idle_time;
4388 	CONN *conn = (CONN *)carg;
4389 	rib_hca_t *hca = ctoqp(conn)->hca;
4390 	int error;
4391 
4392 	mutex_enter(&conn->c_lock);
4393 	if ((conn->c_ref > 0) ||
4394 	    (conn->c_state == C_DISCONN_PEND)) {
4395 		conn->c_timeout = NULL;
4396 		mutex_exit(&conn->c_lock);
4397 		return;
4398 	}
4399 
4400 	idle_time = (gethrestime_sec() - conn->c_last_used);
4401 
4402 	if ((idle_time <= rib_conn_timeout) &&
4403 	    (conn->c_state != C_ERROR_CONN)) {
4404 		/*
4405 		 * There was activity after the last timeout.
4406 		 * Extend the conn life. Unless the conn is
4407 		 * already in error state.
4408 		 */
4409 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4410 		    SEC_TO_TICK(rib_conn_timeout - idle_time));
4411 		mutex_exit(&conn->c_lock);
4412 		return;
4413 	}
4414 
4415 	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4416 	    (void *)conn, DDI_NOSLEEP);
4417 
4418 	/*
4419 	 * If taskq dispatch fails above, then reset the timeout
4420 	 * to try again after 10 secs.
4421 	 */
4422 
4423 	if (error != DDI_SUCCESS) {
4424 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4425 		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4426 		mutex_exit(&conn->c_lock);
4427 		return;
4428 	}
4429 
4430 	conn->c_state = C_DISCONN_PEND;
4431 	mutex_exit(&conn->c_lock);
4432 }
4433 
4434 static rdma_stat
4435 rib_conn_release(CONN *conn)
4436 {
4437 
4438 	mutex_enter(&conn->c_lock);
4439 	conn->c_ref--;
4440 
4441 	conn->c_last_used = gethrestime_sec();
4442 	if (conn->c_ref > 0) {
4443 		mutex_exit(&conn->c_lock);
4444 		return (RDMA_SUCCESS);
4445 	}
4446 
4447 	/*
4448 	 * If a conn is C_ERROR_CONN, close the channel.
4449 	 */
4450 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4451 		conn->c_state = C_DISCONN_PEND;
4452 		mutex_exit(&conn->c_lock);
4453 		rib_conn_close((void *)conn);
4454 		return (RDMA_SUCCESS);
4455 	}
4456 
4457 	/*
4458 	 * c_ref == 0, set a timeout for conn release
4459 	 */
4460 
4461 	if (conn->c_timeout == NULL) {
4462 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4463 		    SEC_TO_TICK(rib_conn_timeout));
4464 	}
4465 
4466 	mutex_exit(&conn->c_lock);
4467 	return (RDMA_SUCCESS);
4468 }
4469 
4470 /*
4471  * Add at front of list
4472  */
4473 static struct rdma_done_list *
4474 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4475 {
4476 	struct rdma_done_list *rd;
4477 
4478 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4479 
4480 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4481 	rd->xid = xid;
4482 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4483 
4484 	rd->prev = NULL;
4485 	rd->next = qp->rdlist;
4486 	if (qp->rdlist != NULL)
4487 		qp->rdlist->prev = rd;
4488 	qp->rdlist = rd;
4489 
4490 	return (rd);
4491 }
4492 
4493 static void
4494 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4495 {
4496 	struct rdma_done_list *r;
4497 
4498 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4499 
4500 	r = rd->next;
4501 	if (r != NULL) {
4502 		r->prev = rd->prev;
4503 	}
4504 
4505 	r = rd->prev;
4506 	if (r != NULL) {
4507 		r->next = rd->next;
4508 	} else {
4509 		qp->rdlist = rd->next;
4510 	}
4511 
4512 	cv_destroy(&rd->rdma_done_cv);
4513 	kmem_free(rd, sizeof (*rd));
4514 }
4515 
4516 static void
4517 rdma_done_rem_list(rib_qp_t *qp)
4518 {
4519 	struct rdma_done_list	*r, *n;
4520 
4521 	mutex_enter(&qp->rdlist_lock);
4522 	for (r = qp->rdlist; r != NULL; r = n) {
4523 		n = r->next;
4524 		rdma_done_rm(qp, r);
4525 	}
4526 	mutex_exit(&qp->rdlist_lock);
4527 }
4528 
4529 static void
4530 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4531 {
4532 	struct rdma_done_list *r = qp->rdlist;
4533 
4534 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4535 
4536 	while (r) {
4537 		if (r->xid == xid) {
4538 			cv_signal(&r->rdma_done_cv);
4539 			return;
4540 		} else {
4541 			r = r->next;
4542 		}
4543 	}
4544 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4545 	    int, xid);
4546 }
4547 
4548 /*
4549  * Expects conn->c_lock to be held by the caller.
4550  */
4551 
4552 static void
4553 rib_close_a_channel(CONN *conn)
4554 {
4555 	rib_qp_t	*qp;
4556 	qp = ctoqp(conn);
4557 
4558 	if (qp->qp_hdl == NULL) {
4559 		/* channel already freed */
4560 		return;
4561 	}
4562 
4563 	/*
4564 	 * Call ibt_close_rc_channel in blocking mode
4565 	 * with no callbacks.
4566 	 */
4567 	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4568 	    NULL, 0, NULL, NULL, 0);
4569 }
4570 
4571 /*
4572  * Goes through all connections and closes the channel
4573  * This will cause all the WRs on those channels to be
4574  * flushed.
4575  */
4576 static void
4577 rib_close_channels(rib_conn_list_t *connlist)
4578 {
4579 	CONN 		*conn, *tmp;
4580 
4581 	rw_enter(&connlist->conn_lock, RW_READER);
4582 	conn = connlist->conn_hd;
4583 	while (conn != NULL) {
4584 		mutex_enter(&conn->c_lock);
4585 		tmp = conn->c_next;
4586 		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4587 
4588 			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4589 
4590 			/*
4591 			 * Live connection in CONNECTED state.
4592 			 */
4593 			if (conn->c_state == C_CONNECTED)
4594 				conn->c_state = C_ERROR_CONN;
4595 			mutex_exit(&conn->c_lock);
4596 
4597 			rib_close_a_channel(conn);
4598 
4599 			mutex_enter(&conn->c_lock);
4600 			conn->c_flags &= ~C_CLOSE_PENDING;
4601 			/* Signal a pending rib_disconnect_channel() */
4602 			cv_signal(&conn->c_cv);
4603 		}
4604 		mutex_exit(&conn->c_lock);
4605 		conn = tmp;
4606 	}
4607 	rw_exit(&connlist->conn_lock);
4608 }
4609 
4610 /*
4611  * Frees up all connections that are no longer being referenced
4612  */
4613 static void
4614 rib_purge_connlist(rib_conn_list_t *connlist)
4615 {
4616 	CONN 		*conn;
4617 
4618 top:
4619 	rw_enter(&connlist->conn_lock, RW_READER);
4620 	conn = connlist->conn_hd;
4621 	while (conn != NULL) {
4622 		mutex_enter(&conn->c_lock);
4623 
4624 		/*
4625 		 * At this point connection is either in ERROR
4626 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4627 		 * then some other thread is culling that connection.
4628 		 * If not and if c_ref is 0, then destroy the connection.
4629 		 */
4630 		if (conn->c_ref == 0 &&
4631 		    conn->c_state != C_DISCONN_PEND) {
4632 			/*
4633 			 * Cull the connection
4634 			 */
4635 			conn->c_state = C_DISCONN_PEND;
4636 			mutex_exit(&conn->c_lock);
4637 			rw_exit(&connlist->conn_lock);
4638 			(void) rib_disconnect_channel(conn, connlist);
4639 			goto top;
4640 		} else {
4641 			/*
4642 			 * conn disconnect already scheduled or will
4643 			 * happen from conn_release when c_ref drops to 0.
4644 			 */
4645 			mutex_exit(&conn->c_lock);
4646 		}
4647 		conn = conn->c_next;
4648 	}
4649 	rw_exit(&connlist->conn_lock);
4650 
4651 	/*
4652 	 * At this point, only connections with c_ref != 0 are on the list
4653 	 */
4654 }
4655 
4656 /*
4657  * Free all the HCA resources and close
4658  * the hca.
4659  */
4660 
4661 static void
4662 rib_free_hca(rib_hca_t *hca)
4663 {
4664 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4665 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4666 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4667 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4668 
4669 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4670 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4671 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4672 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4673 
4674 	rib_rbufpool_destroy(hca, RECV_BUFFER);
4675 	rib_rbufpool_destroy(hca, SEND_BUFFER);
4676 	rib_destroy_cache(hca);
4677 	if (rib_mod.rdma_count == 0)
4678 		rdma_unregister_mod(&rib_mod);
4679 	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4680 	(void) ibt_close_hca(hca->hca_hdl);
4681 	hca->hca_hdl = NULL;
4682 }
4683 
4684 
4685 static void
4686 rib_stop_hca_services(rib_hca_t *hca)
4687 {
4688 	rib_stop_services(hca);
4689 	rib_close_channels(&hca->cl_conn_list);
4690 	rib_close_channels(&hca->srv_conn_list);
4691 
4692 	rib_purge_connlist(&hca->cl_conn_list);
4693 	rib_purge_connlist(&hca->srv_conn_list);
4694 
4695 	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4696 		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4697 		    GLOBAL_ZONEID);
4698 		stats_enabled = FALSE;
4699 	}
4700 
4701 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4702 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4703 	if (hca->srv_conn_list.conn_hd == NULL &&
4704 	    hca->cl_conn_list.conn_hd == NULL) {
4705 		/*
4706 		 * conn_lists are NULL, so destroy
4707 		 * buffers, close hca and be done.
4708 		 */
4709 		rib_free_hca(hca);
4710 	}
4711 	rw_exit(&hca->cl_conn_list.conn_lock);
4712 	rw_exit(&hca->srv_conn_list.conn_lock);
4713 
4714 	if (hca->hca_hdl != NULL) {
4715 		mutex_enter(&hca->inuse_lock);
4716 		while (hca->inuse)
4717 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4718 		mutex_exit(&hca->inuse_lock);
4719 
4720 		rib_free_hca(hca);
4721 	}
4722 	rw_destroy(&hca->bound_services_lock);
4723 
4724 	if (hca->cleanup_helper != NULL) {
4725 		ddi_taskq_destroy(hca->cleanup_helper);
4726 		hca->cleanup_helper = NULL;
4727 	}
4728 }
4729 
4730 /*
4731  * Cleans and closes up all uses of the HCA
4732  */
4733 static void
4734 rib_detach_hca(rib_hca_t *hca)
4735 {
4736 	rib_hca_t **hcap;
4737 
4738 	/*
4739 	 * Stop all services on the HCA
4740 	 * Go through cl_conn_list and close all rc_channels
4741 	 * Go through svr_conn_list and close all rc_channels
4742 	 * Free connections whose c_ref has dropped to 0
4743 	 * Destroy all CQs
4744 	 * Deregister and released all buffer pool memory after all
4745 	 * connections are destroyed
4746 	 * Free the protection domain
4747 	 * ibt_close_hca()
4748 	 */
4749 	rw_enter(&hca->state_lock, RW_WRITER);
4750 	if (hca->state == HCA_DETACHED) {
4751 		rw_exit(&hca->state_lock);
4752 		return;
4753 	}
4754 
4755 	hca->state = HCA_DETACHED;
4756 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4757 	for (hcap = &rib_stat->hcas_list; *hcap && (*hcap != hca);
4758 	    hcap = &(*hcap)->next)
4759 		;
4760 	ASSERT(*hcap == hca);
4761 	*hcap = hca->next;
4762 	rib_stat->nhca_inited--;
4763 	rib_mod.rdma_count--;
4764 	rw_exit(&rib_stat->hcas_list_lock);
4765 	rw_exit(&hca->state_lock);
4766 
4767 	rib_stop_hca_services(hca);
4768 
4769 	kmem_free(hca, sizeof (*hca));
4770 }
4771 
4772 static void
4773 rib_server_side_cache_reclaim(void *argp)
4774 {
4775 	cache_avl_struct_t    *rcas;
4776 	rib_lrc_entry_t		*rb;
4777 	rib_hca_t *hca = (rib_hca_t *)argp;
4778 
4779 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4780 	rcas = avl_first(&hca->avl_tree);
4781 	if (rcas != NULL)
4782 		avl_remove(&hca->avl_tree, rcas);
4783 
4784 	while (rcas != NULL) {
4785 		while (rcas->r.forw != &rcas->r) {
4786 			rcas->elements--;
4787 			rb = rcas->r.forw;
4788 			remque(rb);
4789 			if (rb->registered)
4790 				(void) rib_deregistermem_via_hca(hca,
4791 				    rb->lrc_buf, rb->lrc_mhandle);
4792 
4793 			hca->cache_allocation -= rb->lrc_len;
4794 			kmem_free(rb->lrc_buf, rb->lrc_len);
4795 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4796 		}
4797 		mutex_destroy(&rcas->node_lock);
4798 		kmem_cache_free(hca->server_side_cache, rcas);
4799 		rcas = avl_first(&hca->avl_tree);
4800 		if (rcas != NULL)
4801 			avl_remove(&hca->avl_tree, rcas);
4802 	}
4803 	rw_exit(&hca->avl_rw_lock);
4804 }
4805 
4806 static void
4807 rib_server_side_cache_cleanup(void *argp)
4808 {
4809 	cache_avl_struct_t    *rcas;
4810 	rib_lrc_entry_t		*rb;
4811 	rib_hca_t *hca = (rib_hca_t *)argp;
4812 
4813 	mutex_enter(&hca->cache_allocation_lock);
4814 	if (hca->cache_allocation < cache_limit) {
4815 		mutex_exit(&hca->cache_allocation_lock);
4816 		return;
4817 	}
4818 	mutex_exit(&hca->cache_allocation_lock);
4819 
4820 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4821 	rcas = avl_last(&hca->avl_tree);
4822 	if (rcas != NULL)
4823 		avl_remove(&hca->avl_tree, rcas);
4824 
4825 	while (rcas != NULL) {
4826 		while (rcas->r.forw != &rcas->r) {
4827 			rcas->elements--;
4828 			rb = rcas->r.forw;
4829 			remque(rb);
4830 			if (rb->registered)
4831 				(void) rib_deregistermem_via_hca(hca,
4832 				    rb->lrc_buf, rb->lrc_mhandle);
4833 
4834 			hca->cache_allocation -= rb->lrc_len;
4835 
4836 			kmem_free(rb->lrc_buf, rb->lrc_len);
4837 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4838 		}
4839 		mutex_destroy(&rcas->node_lock);
4840 		if (hca->server_side_cache) {
4841 			kmem_cache_free(hca->server_side_cache, rcas);
4842 		}
4843 
4844 		if (hca->cache_allocation < cache_limit) {
4845 			rw_exit(&hca->avl_rw_lock);
4846 			return;
4847 		}
4848 
4849 		rcas = avl_last(&hca->avl_tree);
4850 		if (rcas != NULL)
4851 			avl_remove(&hca->avl_tree, rcas);
4852 	}
4853 	rw_exit(&hca->avl_rw_lock);
4854 }
4855 
4856 static int
4857 avl_compare(const void *t1, const void *t2)
4858 {
4859 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4860 		return (0);
4861 
4862 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4863 		return (-1);
4864 
4865 	return (1);
4866 }
4867 
4868 static void
4869 rib_destroy_cache(rib_hca_t *hca)
4870 {
4871 	if (hca->avl_init) {
4872 		rib_server_side_cache_reclaim((void *)hca);
4873 		if (hca->server_side_cache) {
4874 			kmem_cache_destroy(hca->server_side_cache);
4875 			hca->server_side_cache = NULL;
4876 		}
4877 		avl_destroy(&hca->avl_tree);
4878 		mutex_destroy(&hca->cache_allocation_lock);
4879 		rw_destroy(&hca->avl_rw_lock);
4880 	}
4881 	hca->avl_init = FALSE;
4882 }
4883 
4884 static void
4885 rib_force_cleanup(void *hca)
4886 {
4887 	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
4888 		(void) ddi_taskq_dispatch(
4889 		    ((rib_hca_t *)hca)->cleanup_helper,
4890 		    rib_server_side_cache_cleanup,
4891 		    (void *)hca, DDI_NOSLEEP);
4892 }
4893 
4894 static rib_lrc_entry_t *
4895 rib_get_cache_buf(CONN *conn, uint32_t len)
4896 {
4897 	cache_avl_struct_t	cas, *rcas;
4898 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4899 	rib_lrc_entry_t *reply_buf;
4900 	avl_index_t where = NULL;
4901 	uint64_t c_alloc = 0;
4902 
4903 	if (!hca->avl_init)
4904 		goto  error_alloc;
4905 
4906 	cas.len = len;
4907 
4908 	rw_enter(&hca->avl_rw_lock, RW_READER);
4909 
4910 	mutex_enter(&hca->cache_allocation_lock);
4911 	c_alloc = hca->cache_allocation;
4912 	mutex_exit(&hca->cache_allocation_lock);
4913 
4914 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
4915 	    &where)) == NULL) {
4916 		/* Am I above the cache limit */
4917 		if ((c_alloc + len) >= cache_limit) {
4918 			rib_force_cleanup((void *)hca);
4919 			rw_exit(&hca->avl_rw_lock);
4920 			mutex_enter(&hca->cache_allocation_lock);
4921 			hca->cache_misses_above_the_limit ++;
4922 			mutex_exit(&hca->cache_allocation_lock);
4923 
4924 			/* Allocate and register the buffer directly */
4925 			goto error_alloc;
4926 		}
4927 
4928 		rw_exit(&hca->avl_rw_lock);
4929 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
4930 
4931 		/* Recheck to make sure no other thread added the entry in */
4932 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
4933 		    &cas, &where)) == NULL) {
4934 			/* Allocate an avl tree entry */
4935 			rcas = (cache_avl_struct_t *)
4936 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
4937 
4938 			bzero(rcas, sizeof (cache_avl_struct_t));
4939 			rcas->elements = 0;
4940 			rcas->r.forw = &rcas->r;
4941 			rcas->r.back = &rcas->r;
4942 			rcas->len = len;
4943 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
4944 			avl_insert(&hca->avl_tree, rcas, where);
4945 		}
4946 	}
4947 
4948 	mutex_enter(&rcas->node_lock);
4949 
4950 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
4951 		reply_buf = rcas->r.forw;
4952 		remque(reply_buf);
4953 		rcas->elements--;
4954 		mutex_exit(&rcas->node_lock);
4955 		rw_exit(&hca->avl_rw_lock);
4956 
4957 		mutex_enter(&hca->cache_allocation_lock);
4958 		hca->cache_hits++;
4959 		hca->cache_allocation -= len;
4960 		mutex_exit(&hca->cache_allocation_lock);
4961 	} else {
4962 		/* Am I above the cache limit */
4963 		mutex_exit(&rcas->node_lock);
4964 		if ((c_alloc + len) >= cache_limit) {
4965 			rib_force_cleanup((void *)hca);
4966 			rw_exit(&hca->avl_rw_lock);
4967 
4968 			mutex_enter(&hca->cache_allocation_lock);
4969 			hca->cache_misses_above_the_limit++;
4970 			mutex_exit(&hca->cache_allocation_lock);
4971 			/* Allocate and register the buffer directly */
4972 			goto error_alloc;
4973 		}
4974 		rw_exit(&hca->avl_rw_lock);
4975 		mutex_enter(&hca->cache_allocation_lock);
4976 		hca->cache_misses++;
4977 		mutex_exit(&hca->cache_allocation_lock);
4978 		/* Allocate a reply_buf entry */
4979 		reply_buf = (rib_lrc_entry_t *)
4980 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4981 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
4982 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
4983 		reply_buf->lrc_len  = len;
4984 		reply_buf->registered = FALSE;
4985 		reply_buf->avl_node = (void *)rcas;
4986 	}
4987 
4988 	return (reply_buf);
4989 
4990 error_alloc:
4991 	reply_buf = (rib_lrc_entry_t *)
4992 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
4993 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
4994 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
4995 	reply_buf->lrc_len = len;
4996 	reply_buf->registered = FALSE;
4997 	reply_buf->avl_node = NULL;
4998 
4999 	return (reply_buf);
5000 }
5001 
5002 /*
5003  * Return a pre-registered back to the cache (without
5004  * unregistering the buffer)..
5005  */
5006 
5007 static void
5008 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5009 {
5010 	cache_avl_struct_t    cas, *rcas;
5011 	avl_index_t where = NULL;
5012 	rib_hca_t	*hca = (ctoqp(conn))->hca;
5013 
5014 	if (!hca->avl_init)
5015 		goto  error_free;
5016 
5017 	cas.len = reg_buf->lrc_len;
5018 	rw_enter(&hca->avl_rw_lock, RW_READER);
5019 	if ((rcas = (cache_avl_struct_t *)
5020 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5021 		rw_exit(&hca->avl_rw_lock);
5022 		goto error_free;
5023 	} else {
5024 		cas.len = reg_buf->lrc_len;
5025 		mutex_enter(&rcas->node_lock);
5026 		insque(reg_buf, &rcas->r);
5027 		rcas->elements ++;
5028 		mutex_exit(&rcas->node_lock);
5029 		rw_exit(&hca->avl_rw_lock);
5030 		mutex_enter(&hca->cache_allocation_lock);
5031 		hca->cache_allocation += cas.len;
5032 		mutex_exit(&hca->cache_allocation_lock);
5033 	}
5034 
5035 	return;
5036 
5037 error_free:
5038 
5039 	if (reg_buf->registered)
5040 		(void) rib_deregistermem_via_hca(hca,
5041 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5042 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5043 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5044 }
5045 
5046 static rdma_stat
5047 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5048 	uint_t buflen, struct mrc *buf_handle)
5049 {
5050 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
5051 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
5052 	rdma_stat	status;
5053 
5054 
5055 	/*
5056 	 * Note: ALL buffer pools use the same memory type RDMARW.
5057 	 */
5058 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5059 	if (status == RDMA_SUCCESS) {
5060 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5061 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5062 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5063 	} else {
5064 		buf_handle->mrc_linfo = NULL;
5065 		buf_handle->mrc_lmr = 0;
5066 		buf_handle->mrc_rmr = 0;
5067 	}
5068 	return (status);
5069 }
5070 
5071 /* ARGSUSED */
5072 static rdma_stat
5073 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5074     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5075 {
5076 
5077 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5078 	return (RDMA_SUCCESS);
5079 }
5080 
5081 /* ARGSUSED */
5082 static rdma_stat
5083 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5084 {
5085 
5086 	(void) ibt_deregister_mr(hca->hca_hdl,
5087 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5088 	return (RDMA_SUCCESS);
5089 }
5090 
5091 /*
5092  * Check if the IP interface named by `lifrp' is RDMA-capable.
5093  */
5094 static boolean_t
5095 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5096 {
5097 	char ifname[LIFNAMSIZ];
5098 	char *cp;
5099 
5100 	if (lifrp->lifr_type == IFT_IB)
5101 		return (B_TRUE);
5102 
5103 	/*
5104 	 * Strip off the logical interface portion before getting
5105 	 * intimate with the name.
5106 	 */
5107 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5108 	if ((cp = strchr(ifname, ':')) != NULL)
5109 		*cp = '\0';
5110 
5111 	return (strcmp("lo0", ifname) == 0);
5112 }
5113 
5114 static int
5115 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5116 {
5117 	vnode_t *kvp, *vp;
5118 	TIUSER  *tiptr;
5119 	struct  strioctl iocb;
5120 	k_sigset_t smask;
5121 	int	err = 0;
5122 
5123 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
5124 		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
5125 		    &tiptr, CRED()) == 0) {
5126 			vp = tiptr->fp->f_vnode;
5127 		} else {
5128 			VN_RELE(kvp);
5129 			return (EPROTO);
5130 		}
5131 	} else {
5132 		return (EPROTO);
5133 	}
5134 
5135 	iocb.ic_cmd = cmd;
5136 	iocb.ic_timout = 0;
5137 	iocb.ic_len = len;
5138 	iocb.ic_dp = (caddr_t)arg;
5139 	sigintr(&smask, 0);
5140 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5141 	sigunintr(&smask);
5142 	(void) t_kclose(tiptr, 0);
5143 	VN_RELE(kvp);
5144 	return (err);
5145 }
5146 
5147 /*
5148  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5149  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5150  */
5151 static int
5152 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5153 {
5154 	int err;
5155 	struct lifnum lifn;
5156 
5157 	bzero(&lifn, sizeof (struct lifnum));
5158 	lifn.lifn_family = AF_UNSPEC;
5159 
5160 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5161 	if (err != 0)
5162 		return (err);
5163 
5164 	/*
5165 	 * Pad the interface count to account for additional interfaces that
5166 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5167 	 */
5168 	lifn.lifn_count += 4;
5169 
5170 	bzero(lifcp, sizeof (struct lifconf));
5171 	lifcp->lifc_family = AF_UNSPEC;
5172 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5173 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5174 
5175 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5176 	if (err != 0) {
5177 		kmem_free(lifcp->lifc_buf, *bufsizep);
5178 		return (err);
5179 	}
5180 	return (0);
5181 }
5182 
5183 static boolean_t
5184 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5185 {
5186 	uint_t i, nifs;
5187 	uint_t bufsize;
5188 	struct lifconf lifc;
5189 	struct lifreq *lifrp;
5190 	struct sockaddr_in *sinp;
5191 	struct sockaddr_in6 *sin6p;
5192 
5193 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5194 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5195 
5196 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5197 		return (B_FALSE);
5198 
5199 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5200 		kmem_free(lifc.lifc_buf, bufsize);
5201 		return (B_FALSE);
5202 	}
5203 
5204 	/*
5205 	 * Worst case is that all of the addresses are IB-capable and have
5206 	 * the same address family, so size our buffers accordingly.
5207 	 */
5208 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5209 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5210 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5211 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5212 
5213 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5214 		if (!rpcib_rdma_capable_interface(lifrp))
5215 			continue;
5216 
5217 		if (lifrp->lifr_addr.ss_family == AF_INET) {
5218 			sinp = addrs4->ri_list;
5219 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5220 			    sizeof (struct sockaddr_in));
5221 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5222 			sin6p = addrs6->ri_list;
5223 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5224 			    sizeof (struct sockaddr_in6));
5225 		}
5226 	}
5227 
5228 	kmem_free(lifc.lifc_buf, bufsize);
5229 	return (B_TRUE);
5230 }
5231 
5232 /* ARGSUSED */
5233 static int
5234 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5235 {
5236 	rib_hca_t *hca;
5237 
5238 	if (KSTAT_WRITE == rw) {
5239 		return (EACCES);
5240 	}
5241 
5242 	rpcib_kstat.cache_limit.value.ui64 =
5243 	    (uint64_t)cache_limit;
5244 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5245 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5246 		rpcib_kstat.cache_allocation.value.ui64 +=
5247 		    (uint64_t)hca->cache_allocation;
5248 		rpcib_kstat.cache_hits.value.ui64 +=
5249 		    (uint64_t)hca->cache_hits;
5250 		rpcib_kstat.cache_misses.value.ui64 +=
5251 		    (uint64_t)hca->cache_misses;
5252 		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5253 		    (uint64_t)hca->cache_misses_above_the_limit;
5254 	}
5255 	rw_exit(&rib_stat->hcas_list_lock);
5256 	return (0);
5257 }
5258