xref: /titanic_44/usr/src/uts/common/rpc/rpcib.c (revision cb8a054b1ab30d5caa746e6c44f29d4c9d3071c1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 
25 /*
26  * Copyright (c) 2007, The Ohio State University. All rights reserved.
27  *
28  * Portions of this source code is developed by the team members of
29  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
30  * headed by Professor Dhabaleswar K. (DK) Panda.
31  *
 * Acknowledgements to contributions from developers:
33  *   Ranjit Noronha: noronha@cse.ohio-state.edu
34  *   Lei Chai      : chail@cse.ohio-state.edu
35  *   Weikuan Yu    : yuw@cse.ohio-state.edu
36  *
37  */
38 
39 /*
40  * The rpcib plugin. Implements the interface for RDMATF's
41  * interaction with IBTF.
42  */
43 
44 #include <sys/param.h>
45 #include <sys/types.h>
46 #include <sys/user.h>
47 #include <sys/systm.h>
48 #include <sys/sysmacros.h>
49 #include <sys/proc.h>
50 #include <sys/socket.h>
51 #include <sys/file.h>
52 #include <sys/stream.h>
53 #include <sys/strsubr.h>
54 #include <sys/stropts.h>
55 #include <sys/errno.h>
56 #include <sys/kmem.h>
57 #include <sys/debug.h>
58 #include <sys/pathname.h>
59 #include <sys/kstat.h>
60 #include <sys/t_lock.h>
61 #include <sys/ddi.h>
62 #include <sys/cmn_err.h>
63 #include <sys/time.h>
64 #include <sys/isa_defs.h>
65 #include <sys/callb.h>
66 #include <sys/sunddi.h>
67 #include <sys/sunndi.h>
68 #include <sys/sdt.h>
69 #include <sys/ib/ibtl/ibti.h>
70 #include <rpc/rpc.h>
71 #include <rpc/ib.h>
72 #include <sys/modctl.h>
73 #include <sys/kstr.h>
74 #include <sys/sockio.h>
75 #include <sys/vnode.h>
76 #include <sys/tiuser.h>
77 #include <net/if.h>
78 #include <net/if_types.h>
79 #include <sys/cred.h>
80 #include <rpc/rpc_rdma.h>
81 #include <nfs/nfs.h>
82 #include <sys/atomic.h>
83 
84 #define	NFS_RDMA_PORT	20049
85 
86 
87 /*
88  * Convenience structures for connection management
89  */
90 typedef struct rpcib_ipaddrs {
91 	void	*ri_list;	/* pointer to list of addresses */
92 	uint_t	ri_count;	/* number of addresses in list */
93 	uint_t	ri_size;	/* size of ri_list in bytes */
94 } rpcib_ipaddrs_t;
95 
96 
97 typedef struct rpcib_ping {
98 	rib_hca_t  *hca;
99 	ibt_path_info_t path;
100 	ibt_ip_addr_t srcip;
101 	ibt_ip_addr_t dstip;
102 } rpcib_ping_t;
103 
104 /*
105  * Prototype declarations for driver ops
106  */
107 static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
108 static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
109 				void *, void **);
110 static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);
111 static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
112 static int	rpcib_do_ip_ioctl(int, int, void *);
113 static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
114 static int rpcib_cache_kstat_update(kstat_t *, int);
115 static void rib_force_cleanup(void *);
116 static void rib_stop_hca_services(rib_hca_t *);
117 static void rib_attach_hca(void);
118 static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
119 		struct netbuf *d_svcaddr, CONN **conn);
120 
121 struct {
122 	kstat_named_t cache_limit;
123 	kstat_named_t cache_allocation;
124 	kstat_named_t cache_hits;
125 	kstat_named_t cache_misses;
126 	kstat_named_t cache_misses_above_the_limit;
127 } rpcib_kstat = {
128 	{"cache_limit",			KSTAT_DATA_UINT64 },
129 	{"cache_allocation",		KSTAT_DATA_UINT64 },
130 	{"cache_hits",			KSTAT_DATA_UINT64 },
131 	{"cache_misses",		KSTAT_DATA_UINT64 },
132 	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
133 };
134 
135 /* rpcib cb_ops */
136 static struct cb_ops rpcib_cbops = {
137 	nulldev,		/* open */
138 	nulldev,		/* close */
139 	nodev,			/* strategy */
140 	nodev,			/* print */
141 	nodev,			/* dump */
142 	nodev,			/* read */
143 	nodev,			/* write */
144 	nodev,			/* ioctl */
145 	nodev,			/* devmap */
146 	nodev,			/* mmap */
147 	nodev,			/* segmap */
148 	nochpoll,		/* poll */
149 	ddi_prop_op,		/* prop_op */
150 	NULL,			/* stream */
151 	D_MP,			/* cb_flag */
152 	CB_REV,			/* rev */
153 	nodev,			/* int (*cb_aread)() */
154 	nodev			/* int (*cb_awrite)() */
155 };
156 
157 /*
158  * Device options
159  */
160 static struct dev_ops rpcib_ops = {
161 	DEVO_REV,		/* devo_rev, */
162 	0,			/* refcnt  */
163 	rpcib_getinfo,		/* info */
164 	nulldev,		/* identify */
165 	nulldev,		/* probe */
166 	rpcib_attach,		/* attach */
167 	rpcib_detach,		/* detach */
168 	nodev,			/* reset */
169 	&rpcib_cbops,		    /* driver ops - devctl interfaces */
170 	NULL,			/* bus operations */
171 	NULL,			/* power */
172 	ddi_quiesce_not_needed,		/* quiesce */
173 };
174 
175 /*
176  * Module linkage information.
177  */
178 
179 static struct modldrv rib_modldrv = {
180 	&mod_driverops,		/* Driver module */
181 	"RPCIB plugin driver",	/* Driver name and version */
182 	&rpcib_ops,		/* Driver ops */
183 };
184 
185 static struct modlinkage rib_modlinkage = {
186 	MODREV_1,
187 	(void *)&rib_modldrv,
188 	NULL
189 };
190 
/*
 * Long reply cache (LRC) entry.  Entries are chained on a doubly
 * linked free list (forw/back) and hang off an AVL node keyed by
 * buffer length (see cache_avl_struct_t below).
 */
typedef struct rib_lrc_entry {
	struct rib_lrc_entry *forw;	/* free-list linkage */
	struct rib_lrc_entry *back;	/* free-list linkage */
	char *lrc_buf;			/* the cached buffer itself */

	uint32_t lrc_len;		/* length of lrc_buf in bytes */
	void  *avl_node;		/* back pointer to owning AVL node */
	bool_t registered;		/* TRUE if memory is registered */

	struct mrc lrc_mhandle;		/* memory registration handle */
	bool_t lrc_on_freed_list;	/* TRUE while entry sits on free list */
} rib_lrc_entry_t;

/*
 * One AVL node per buffer size; collects all cache entries of length
 * "len".  The tree itself lives in rib_hca_t (avl_tree).
 */
typedef	struct cache_struct	{
	rib_lrc_entry_t		r;	/* head of the entry list */
	uint32_t		len;	/* buffer length this node covers */
	uint32_t		elements;	/* number of entries held */
	kmutex_t		node_lock;	/* protects this node's list */
	avl_node_t		avl_link;	/* AVL tree linkage */
} cache_avl_struct_t;
211 
212 uint64_t	cache_limit = 100 * 1024 * 1024;
213 static uint64_t	cache_watermark = 80 * 1024 * 1024;
214 static bool_t	stats_enabled = FALSE;
215 
216 static uint64_t max_unsignaled_rws = 5;
217 int nfs_rdma_port = NFS_RDMA_PORT;
218 
219 #define	RIBNETID_TCP	"tcp"
220 #define	RIBNETID_TCP6	"tcp6"
221 
222 /*
223  * rib_stat: private data pointer used when registering
224  *	with the IBTF.  It is returned to the consumer
225  *	in all callbacks.
226  */
227 static rpcib_state_t *rib_stat = NULL;
228 
229 #define	RNR_RETRIES	IBT_RNR_RETRY_1
230 #define	MAX_PORTS	2
231 #define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
232 #define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */
233 
234 int preposted_rbufs = RDMA_BUFS_GRANT;
235 int send_threshold = 1;
236 
237 /*
238  * Old cards with Tavor driver have limited memory footprint
239  * when booted in 32bit. The rib_max_rbufs tunable can be
240  * tuned for more buffers if needed.
241  */
242 
243 #if !defined(_ELF64) && !defined(__sparc)
244 int rib_max_rbufs = MAX_BUFS;
245 #else
246 int rib_max_rbufs = 10 * MAX_BUFS;
247 #endif	/* !(_ELF64) && !(__sparc) */
248 
249 int rib_conn_timeout = 60 * 12;		/* 12 minutes */
250 
251 /*
252  * State of the plugin.
253  * ACCEPT = accepting new connections and requests.
254  * NO_ACCEPT = not accepting new connection and requests.
255  * This should eventually move to rpcib_state_t structure, since this
256  * will tell in which state the plugin is for a particular type of service
257  * like NFS, NLM or v4 Callback deamon. The plugin might be in accept
258  * state for one and in no_accept state for the other.
259  */
260 int		plugin_state;
261 kmutex_t	plugin_state_lock;
262 
263 ldi_ident_t rpcib_li;
264 
265 /*
266  * RPCIB RDMATF operations
267  */
268 static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
269 static rdma_stat rib_disconnect(CONN *conn);
270 static void rib_listen(struct rdma_svc_data *rd);
271 static void rib_listen_stop(struct rdma_svc_data *rd);
272 static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
273 	uint_t buflen, struct mrc *buf_handle);
274 static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
275 	struct mrc buf_handle);
276 static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
277 		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
278 static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
279 		struct mrc buf_handle);
280 static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
281 	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
282 	void *lrc);
283 static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
284 	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
285 static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
286 	caddr_t buf, int len, int cpu);
287 
288 static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);
289 
290 static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
291 static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);
292 
293 static void rib_rbuf_free(CONN *conn, int ptype, void *buf);
294 
295 static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
296 static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
297 static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
298 static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
299 static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
300 static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
301 static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
302 static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);
303 static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
304 static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
305 	int addr_type, void *, CONN **);
306 static rdma_stat rib_conn_release(CONN *conn);
307 static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
308 	rpcib_ping_t *, CONN **);
309 static rdma_stat rib_getinfo(rdma_info_t *info);
310 
311 static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
312 static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
313 static void rib_destroy_cache(rib_hca_t *hca);
314 static	void	rib_server_side_cache_reclaim(void *argp);
315 static int avl_compare(const void *t1, const void *t2);
316 
317 static void rib_stop_services(rib_hca_t *);
318 static void rib_close_channels(rib_conn_list_t *);
319 static void rib_conn_close(void *);
320 static void rib_recv_rele(rib_qp_t *);
321 static rdma_stat rib_conn_release_locked(CONN *conn);
322 
323 /*
324  * RPCIB addressing operations
325  */
326 
327 /*
328  * RDMA operations the RPCIB module exports
329  */
330 static rdmaops_t rib_ops = {
331 	rib_reachable,
332 	rib_conn_get,
333 	rib_conn_release,
334 	rib_listen,
335 	rib_listen_stop,
336 	rib_registermem,
337 	rib_deregistermem,
338 	rib_registermemsync,
339 	rib_deregistermemsync,
340 	rib_syncmem,
341 	rib_reg_buf_alloc,
342 	rib_reg_buf_free,
343 	rib_send,
344 	rib_send_resp,
345 	rib_post_resp,
346 	rib_post_resp_remove,
347 	rib_post_recv,
348 	rib_recv,
349 	rib_read,
350 	rib_write,
351 	rib_getinfo,
352 };
353 
354 /*
355  * RDMATF RPCIB plugin details
356  */
357 static rdma_mod_t rib_mod = {
358 	"ibtf",		/* api name */
359 	RDMATF_VERS_1,
360 	0,
361 	&rib_ops,	/* rdma op vector for ibtf */
362 };
363 
364 static rdma_stat rpcib_open_hcas(rpcib_state_t *);
365 static rdma_stat rib_qp_init(rib_qp_t *, int);
366 static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
367 static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
368 static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
369 static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);
370 static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
371 static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
372 	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
373 static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
374 	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);
375 static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
376 static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
377 	rib_qp_t **);
378 static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
379 	rib_qp_t **);
380 static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
381 static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
382 static int rib_free_sendwait(struct send_wid *);
383 static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
384 static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
385 static void rdma_done_rem_list(rib_qp_t *);
386 static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);
387 
388 static void rib_async_handler(void *,
389 	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
390 static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
391 static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
392 static int rib_free_svc_recv(struct svc_recv *);
393 static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
394 static void rib_free_wid(struct recv_wid *);
395 static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
396 static void rib_detach_hca(ibt_hca_hdl_t);
397 static void rib_close_a_channel(CONN *);
398 static void rib_send_hold(rib_qp_t *);
399 static void rib_send_rele(rib_qp_t *);
400 
401 /*
402  * Registration with IBTF as a consumer
403  */
404 static struct ibt_clnt_modinfo_s rib_modinfo = {
405 	IBTI_V_CURR,
406 	IBT_GENERIC,
407 	rib_async_handler,	/* async event handler */
408 	NULL,			/* Memory Region Handler */
409 	"nfs/ib"
410 };
411 
/*
 * Global structure
 */
415 
416 typedef struct rpcib_s {
417 	dev_info_t	*rpcib_dip;
418 	kmutex_t	rpcib_mutex;
419 } rpcib_t;
420 
421 rpcib_t rpcib;
422 
423 /*
424  * /etc/system controlled variable to control
425  * debugging in rpcib kernel module.
426  * Set it to values greater that 1 to control
427  * the amount of debugging messages required.
428  */
429 int rib_debug = 0;
430 
431 int
432 _init(void)
433 {
434 	int error;
435 
436 	error = mod_install((struct modlinkage *)&rib_modlinkage);
437 	if (error != 0) {
438 		/*
439 		 * Could not load module
440 		 */
441 		return (error);
442 	}
443 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
444 	return (0);
445 }
446 
447 int
448 _fini()
449 {
450 	int status;
451 
452 	/*
453 	 * Remove module
454 	 */
455 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
456 		return (status);
457 	}
458 	mutex_destroy(&plugin_state_lock);
459 	return (0);
460 }
461 
/*
 * _info()
 * Loadable-module entry point; reports module information through
 * the linkage structure registered in _init().
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&rib_modlinkage, modinfop));
}
467 
468 /*
469  * rpcib_getinfo()
470  * Given the device number, return the devinfo pointer or the
471  * instance number.
472  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
473  */
474 
475 /*ARGSUSED*/
476 static int
477 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
478 {
479 	int ret = DDI_SUCCESS;
480 
481 	switch (cmd) {
482 	case DDI_INFO_DEVT2DEVINFO:
483 		if (rpcib.rpcib_dip != NULL)
484 			*result = rpcib.rpcib_dip;
485 		else {
486 			*result = NULL;
487 			ret = DDI_FAILURE;
488 		}
489 		break;
490 
491 	case DDI_INFO_DEVT2INSTANCE:
492 		*result = NULL;
493 		break;
494 
495 	default:
496 		ret = DDI_FAILURE;
497 	}
498 	return (ret);
499 }
500 
/*
 * rpcib_free_hca_list()
 * Detach and free every HCA on the global rib_stat->hcas_list.
 * The list head is unhooked under the writer lock first so no other
 * thread can reach the HCAs while they are being torn down; each HCA
 * is then marked HCA_DETACHED (under its own state_lock) before its
 * services are stopped and its memory released.
 */
static void
rpcib_free_hca_list()
{
	rib_hca_t *hca, *hcap;

	/* Atomically steal the whole list from the global state. */
	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
	hca = rib_stat->hcas_list;
	rib_stat->hcas_list = NULL;
	rw_exit(&rib_stat->hcas_list_lock);
	while (hca != NULL) {
		rw_enter(&hca->state_lock, RW_WRITER);
		hcap = hca;
		hca = hca->next;
		/* Keep the global counters in step with the teardown. */
		rib_stat->nhca_inited--;
		rib_mod.rdma_count--;
		hcap->state = HCA_DETACHED;
		rw_exit(&hcap->state_lock);
		rib_stop_hca_services(hcap);

		kmem_free(hcap, sizeof (*hcap));
	}
}
523 
/*
 * rpcib_free_service_list()
 * Unbind and deregister every IBTF service on rib_stat->service_list,
 * freeing each entry as it goes.  Runs under the service_list_lock
 * writer lock; on any IBTF failure the lock is dropped and RDMA_FAILED
 * returned, leaving the remaining (unprocessed) entries on the list.
 *
 * Returns RDMA_SUCCESS when the list has been fully drained.
 */
static rdma_stat
rpcib_free_service_list()
{
	rib_service_t *service;
	ibt_status_t ret;

	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
	while (rib_stat->service_list != NULL) {
		service = rib_stat->service_list;
		/* Remove all port bindings before deregistering. */
		ret = ibt_unbind_all_services(service->srv_hdl);
		if (ret != IBT_SUCCESS) {
			rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
			cmn_err(CE_NOTE, "rpcib_free_service_list: "
			    "ibt_unbind_all_services failed (%d)\n", (int)ret);
#endif
			return (RDMA_FAILED);
		}
		ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
		    service->srv_hdl);
		if (ret != IBT_SUCCESS) {
			rw_exit(&rib_stat->service_list_lock);
#ifdef DEBUG
			cmn_err(CE_NOTE, "rpcib_free_service_list: "
			    "ibt_deregister_service failed (%d)\n", (int)ret);
#endif
			return (RDMA_FAILED);
		}
		/* Unlink and free only after both IBTF calls succeed. */
		rib_stat->service_list = service->next;
		kmem_free(service, sizeof (rib_service_t));
	}
	rw_exit(&rib_stat->service_list_lock);

	return (RDMA_SUCCESS);
}
559 
/*
 * rpcib_attach()
 * DDI attach entry point.  On DDI_ATTACH: records the dip, creates
 * the "rpcib" minor node, allocates and initializes the global
 * rib_stat, attaches to IBTF, opens the HCAs, and finally registers
 * the plugin with the RDMA transport framework (rdmatf).
 * DDI_RESUME is a no-op success; all other commands fail.
 *
 * NOTE(review): on the failure paths below the minor node created by
 * ddi_create_minor_node() is never removed and rpcib.rpcib_dip stays
 * set — a retried attach will then fail the rpcib_dip != NULL check.
 * Verify whether the framework cleans these up on a failed attach.
 */
static int
rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	ibt_status_t	ibt_status;
	rdma_stat	r_status;

	switch (cmd) {
	case DDI_ATTACH:
		break;
	case DDI_RESUME:
		return (DDI_SUCCESS);
	default:
		return (DDI_FAILURE);
	}

	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);

	/* Only one instance of this pseudo driver may attach. */
	mutex_enter(&rpcib.rpcib_mutex);
	if (rpcib.rpcib_dip != NULL) {
		mutex_exit(&rpcib.rpcib_mutex);
		return (DDI_FAILURE);
	}
	rpcib.rpcib_dip = dip;
	mutex_exit(&rpcib.rpcib_mutex);
	/*
	 * Create the "rpcib" minor-node.
	 */
	if (ddi_create_minor_node(dip,
	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
		/* Error message, no cmn_err as they print on console */
		return (DDI_FAILURE);
	}

	/* First attach: allocate the global state and its core locks. */
	if (rib_stat == NULL) {
		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
		rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
		mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
	}

	/* Bail out early if no HCAs are present on this system. */
	rib_stat->hca_count = ibt_get_hca_list(NULL);
	if (rib_stat->hca_count < 1) {
		mutex_destroy(&rib_stat->listen_lock);
		rw_destroy(&rib_stat->hcas_list_lock);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	/* Register with IBTF; rib_stat is passed back in all callbacks. */
	ibt_status = ibt_attach(&rib_modinfo, dip,
	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);

	if (ibt_status != IBT_SUCCESS) {
		mutex_destroy(&rib_stat->listen_lock);
		rw_destroy(&rib_stat->hcas_list_lock);
		mutex_destroy(&rib_stat->open_hca_lock);
		kmem_free(rib_stat, sizeof (*rib_stat));
		rib_stat = NULL;
		return (DDI_FAILURE);
	}

	rib_stat->service_list = NULL;
	rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
	mutex_enter(&rib_stat->open_hca_lock);
	if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
		mutex_exit(&rib_stat->open_hca_lock);
		goto open_fail;
	}
	mutex_exit(&rib_stat->open_hca_lock);

	/* Pin the driver: HCA state must survive autodetach attempts. */
	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
	    DDI_PROP_SUCCESS) {
		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
		    "failed.");
		goto register_fail;
	}

	/*
	 * Register with rdmatf
	 */
	r_status = rdma_register_mod(&rib_mod);
	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
		    "status = %d", r_status);
		goto register_fail;
	}

	return (DDI_SUCCESS);

	/* Common unwind: detach IBTF, free HCAs/services, drop rib_stat. */
register_fail:

open_fail:
	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
	rpcib_free_hca_list();
	(void) rpcib_free_service_list();
	mutex_destroy(&rib_stat->listen_lock);
	rw_destroy(&rib_stat->hcas_list_lock);
	mutex_destroy(&rib_stat->open_hca_lock);
	rw_destroy(&rib_stat->service_list_lock);
	kmem_free(rib_stat, sizeof (*rib_stat));
	rib_stat = NULL;
	return (DDI_FAILURE);
}
664 
/*
 * rpcib_detach()
 * DDI detach entry point.  Moves the plugin to NO_ACCEPT so no new
 * connections or requests are taken, then frees the registered IBTF
 * services, the HCA list, and the global rib_stat state in that
 * order.  Fails (without undoing the NO_ACCEPT transition) if the
 * service list cannot be drained.  DDI_SUSPEND is not supported.
 */
/*ARGSUSED*/
static int
rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	switch (cmd) {

	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
	default:
		return (DDI_FAILURE);
	}

	/*
	 * Detach the hca and free resources
	 */
	mutex_enter(&plugin_state_lock);
	plugin_state = NO_ACCEPT;
	mutex_exit(&plugin_state_lock);

	if (rpcib_free_service_list() != RDMA_SUCCESS)
		return (DDI_FAILURE);
	rpcib_free_hca_list();

	/* All IBTF resources are gone; drop the IBTF registration. */
	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
	mutex_destroy(&rib_stat->listen_lock);
	rw_destroy(&rib_stat->hcas_list_lock);
	mutex_destroy(&rib_stat->open_hca_lock);
	rw_destroy(&rib_stat->service_list_lock);

	kmem_free(rib_stat, sizeof (*rib_stat));
	rib_stat = NULL;

	/* Finally release the dip so a future attach can succeed. */
	mutex_enter(&rpcib.rpcib_mutex);
	rpcib.rpcib_dip = NULL;
	mutex_exit(&rpcib.rpcib_mutex);
	mutex_destroy(&rpcib.rpcib_mutex);
	return (DDI_SUCCESS);
}
705 
706 
707 static void rib_rbufpool_free(rib_hca_t *, int);
708 static void rib_rbufpool_deregister(rib_hca_t *, int);
709 static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
710 static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
711 static rdma_stat rib_rem_replylist(rib_qp_t *);
712 static int rib_remreply(rib_qp_t *, struct reply *);
713 static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
714 static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
715 
716 
717 /*
718  * One CQ pair per HCA
719  */
720 static rdma_stat
721 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
722 	rib_cq_t **cqp)
723 {
724 	rib_cq_t	*cq;
725 	ibt_cq_attr_t	cq_attr;
726 	uint32_t	real_size;
727 	ibt_status_t	status;
728 	rdma_stat	error = RDMA_SUCCESS;
729 
730 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
731 	cq->rib_hca = hca;
732 	cq_attr.cq_size = cq_size;
733 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
734 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
735 	    &real_size);
736 	if (status != IBT_SUCCESS) {
737 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
738 		    " status=%d", status);
739 		error = RDMA_FAILED;
740 		goto fail;
741 	}
742 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
743 
744 	/*
745 	 * Enable CQ callbacks. CQ Callbacks are single shot
746 	 * (e.g. you have to call ibt_enable_cq_notify()
747 	 * after each callback to get another one).
748 	 */
749 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
750 	if (status != IBT_SUCCESS) {
751 		cmn_err(CE_WARN, "rib_create_cq: "
752 		    "enable_cq_notify failed, status %d", status);
753 		error = RDMA_FAILED;
754 		goto fail;
755 	}
756 	*cqp = cq;
757 
758 	return (error);
759 fail:
760 	if (cq->rib_cq_hdl)
761 		(void) ibt_free_cq(cq->rib_cq_hdl);
762 	if (cq)
763 		kmem_free(cq, sizeof (rib_cq_t));
764 	return (error);
765 }
766 
767 /*
768  * rpcib_find_hca
769  *
770  * Caller should have already locked the hcas_lock before calling
771  * this function.
772  */
773 static rib_hca_t *
774 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
775 {
776 	rib_hca_t *hca = ribstat->hcas_list;
777 
778 	while (hca && hca->hca_guid != guid)
779 		hca = hca->next;
780 
781 	return (hca);
782 }
783 
/*
 * rpcib_open_hcas()
 * For every HCA GUID reported by IBTF that is not already on
 * ribstat->hcas_list, open the HCA and set it up for RDMA: query
 * attributes and ports, allocate one protection domain, create the
 * two client and two server CQ pairs, build the send/receive buffer
 * pools and the server-side long-reply cache, and initialize the
 * per-HCA locks.  Each fully initialized HCA is prepended to
 * ribstat->hcas_list.  Per-HCA failures are unwound via the
 * fail3/fail2/fail1 ladder and the loop moves on to the next GUID.
 *
 * Returns RDMA_SUCCESS if at least one NEW HCA was configured by
 * this call, RDMA_FAILED otherwise.  Caller must hold open_hca_lock.
 */
static rdma_stat
rpcib_open_hcas(rpcib_state_t *ribstat)
{
	rib_hca_t		*hca;
	ibt_status_t		ibt_status;
	rdma_stat		status;
	ibt_hca_portinfo_t	*pinfop;
	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
	uint_t			size, cq_size;
	int			i;
	kstat_t *ksp;
	cache_avl_struct_t example_avl_node;
	char rssc_name[32];
	int old_nhca_inited = ribstat->nhca_inited;
	ib_guid_t		*hca_guids;

	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));

	ribstat->hca_count = ibt_get_hca_list(&hca_guids);
	if (ribstat->hca_count == 0)
		return (RDMA_FAILED);

	rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
	/*
	 * Open a hca and setup for RDMA
	 */
	for (i = 0; i < ribstat->hca_count; i++) {
		/* Skip HCAs that were configured by an earlier call. */
		if (rpcib_find_hca(ribstat, hca_guids[i]))
			continue;
		hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);

		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
		    hca_guids[i], &hca->hca_hdl);
		if (ibt_status != IBT_SUCCESS) {
			kmem_free(hca, sizeof (rib_hca_t));
			continue;
		}
		hca->hca_guid = hca_guids[i];
		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
		hca->state = HCA_INITED;

		/*
		 * query HCA info
		 */
		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
		if (ibt_status != IBT_SUCCESS) {
			goto fail1;
		}

		/*
		 * One PD (Protection Domain) per HCA.
		 * A qp is allowed to access a memory region
		 * only when it's in the same PD as that of
		 * the memory region.
		 */
		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
		if (ibt_status != IBT_SUCCESS) {
			goto fail1;
		}

		/*
		 * query HCA ports
		 */
		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
		    0, &pinfop, &hca->hca_nports, &size);
		if (ibt_status != IBT_SUCCESS) {
			goto fail2;
		}
		hca->hca_ports = pinfop;
		hca->hca_pinfosz = size;
		pinfop = NULL;

		cq_size = DEF_CQ_SIZE; /* default cq size */
		/*
		 * Create 2 pairs of cq's (1 pair for client
		 * and the other pair for server) on this hca.
		 * If number of qp's gets too large, then several
		 * cq's will be needed.
		 */
		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
		    &hca->svc_rcq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
		    &hca->svc_scq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
		    &hca->clnt_rcq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
		    &hca->clnt_scq);
		if (status != RDMA_SUCCESS) {
			goto fail3;
		}

		/*
		 * Create buffer pools.
		 * Note rib_rbuf_create also allocates memory windows.
		 */
		hca->recv_pool = rib_rbufpool_create(hca,
		    RECV_BUFFER, rib_max_rbufs);
		if (hca->recv_pool == NULL) {
			goto fail3;
		}

		hca->send_pool = rib_rbufpool_create(hca,
		    SEND_BUFFER, rib_max_rbufs);
		if (hca->send_pool == NULL) {
			rib_rbufpool_destroy(hca, RECV_BUFFER);
			goto fail3;
		}

		/* Per-HCA kmem cache for the server-side reply cache. */
		if (hca->server_side_cache == NULL) {
			(void) sprintf(rssc_name,
			    "rib_srvr_cache_%llx",
			    (long long unsigned int) hca->hca_guid);
			hca->server_side_cache = kmem_cache_create(
			    rssc_name,
			    sizeof (cache_avl_struct_t), 0,
			    NULL,
			    NULL,
			    rib_server_side_cache_reclaim,
			    hca, NULL, 0);
		}

		/* Offset computation stands in for offsetof(avl_link). */
		avl_create(&hca->avl_tree,
		    avl_compare,
		    sizeof (cache_avl_struct_t),
		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
		    (uint_t)(uintptr_t)&example_avl_node);

		rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
		    hca->iblock);
		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
		rw_init(&hca->avl_rw_lock,
		    NULL, RW_DRIVER, hca->iblock);
		mutex_init(&hca->cache_allocation_lock,
		    NULL, MUTEX_DRIVER, NULL);
		hca->avl_init = TRUE;

		/* Create kstats for the cache */
		ASSERT(INGLOBALZONE(curproc));

		/* One global kstat shared by all HCAs; install once. */
		if (!stats_enabled) {
			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
			    KSTAT_TYPE_NAMED,
			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
			    GLOBAL_ZONEID);
			if (ksp) {
				ksp->ks_data = (void *) &rpcib_kstat;
				ksp->ks_update = rpcib_cache_kstat_update;
				kstat_install(ksp);
				stats_enabled = TRUE;
			}
		}
		/* Single-threaded taskq used for deferred HCA cleanup. */
		if (hca->cleanup_helper == NULL) {
			char tq_name[sizeof (hca->hca_guid) * 2 + 1];

			(void) snprintf(tq_name, sizeof (tq_name), "%llX",
			    (unsigned long long int) hca->hca_guid);
			hca->cleanup_helper = ddi_taskq_create(NULL,
			    tq_name, 1, TASKQ_DEFAULTPRI, 0);
		}

		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
		    hca->iblock);
		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
		hca->inuse = TRUE;

		/* Fully initialized: publish the HCA on the global list. */
		hca->next = ribstat->hcas_list;
		ribstat->hcas_list = hca;
		ribstat->nhca_inited++;
		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
		continue;

		/* Per-HCA unwind ladder; falls through top to bottom. */
fail3:
		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
fail2:
		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
fail1:
		(void) ibt_close_hca(hca->hca_hdl);
		kmem_free(hca, sizeof (rib_hca_t));
	}
	rw_exit(&ribstat->hcas_list_lock);
	ibt_free_hca_list(hca_guids, ribstat->hca_count);
	rib_mod.rdma_count = rib_stat->nhca_inited;

	/*
	 * return success if at least one new hca has been configured.
	 */
	if (ribstat->nhca_inited != old_nhca_inited)
		return (RDMA_SUCCESS);
	else
		return (RDMA_FAILED);
}
992 
993 /*
994  * Callback routines
995  */
996 
997 /*
998  * SCQ handlers
999  */
/*
 * rib_clnt_scq_handler()
 * Client-side send completion queue handler.  Re-arms the CQ, then
 * drains all pending work completions.  For each completion that
 * carries a real send_wid (i.e. not RDMA_DUMMY_WRID): records the
 * send status, marks the connection C_ERROR_CONN on any failure, and
 * either signals a waiting poster or — when nobody is waiting —
 * frees the send buffers and the send_wid itself.
 */
/* ARGSUSED */
static void
rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct send_wid	*wd;
	CONN		*conn;
	rib_qp_t	*qp;
	int		i;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/*
		 * Got a send completion
		 */
		if (wc.wc_id != RDMA_DUMMY_WRID) {
			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
			qp = wd->qp;
			conn = qptoc(qp);

			mutex_enter(&wd->sendwait_lock);
			switch (wc.wc_status) {
			case IBT_WC_SUCCESS:
				wd->status = RDMA_SUCCESS;
				break;
			default:
/*
 *    RC Send Q Error Code		Local state     Remote State
 *    ==================== 		===========     ============
 *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
 *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
 *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
 *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
 *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
 *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
 *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
 *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
 *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
 *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
 *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
 */
				/*
				 * Channel in error state. Set connection to
				 * ERROR and cleanup will happen either from
				 * conn_release  or from rib_conn_get
				 */
				wd->status = RDMA_FAILED;
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				break;
			}

			if (wd->cv_sig == 1) {
				/*
				 * Notify poster
				 */
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}

				/* decrement the send ref count */
				rib_send_rele(qp);

				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}
1092 
/*
 * Server-side send completion queue (SCQ) handler.
 *
 * Mirrors rib_clnt_scq_handler(): drains the CQ, records each send's
 * status in its send_wid, transitions the connection to C_ERROR_CONN
 * on any completion error, and either signals the waiting poster or
 * frees the send buffers and wait id in place.
 */
/* ARGSUSED */
static void
rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct send_wid	*wd;
	rib_qp_t	*qp;
	CONN		*conn;
	int		i;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/*
		 * Got a send completion
		 */
		if (wc.wc_id != RDMA_DUMMY_WRID) {
			/* wc_id carries the send_wid posted with the WR */
			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
			qp = wd->qp;
			conn = qptoc(qp);
			mutex_enter(&wd->sendwait_lock);

			switch (wc.wc_status) {
			case IBT_WC_SUCCESS:
				wd->status = RDMA_SUCCESS;
				break;
			default:
				/*
				 * Channel in error state. Set connection to
				 * ERROR and cleanup will happen either from
				 * conn_release  or conn timeout.
				 */
				wd->status = RDMA_FAILED;
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				break;
			}

			if (wd->cv_sig == 1) {
				/*
				 * Update completion status and notify poster
				 */
				cv_signal(&wd->wait_cv);
				mutex_exit(&wd->sendwait_lock);
			} else {
				/*
				 * Poster not waiting for notification.
				 * Free the send buffers and send_wid
				 */
				for (i = 0; i < wd->nsbufs; i++) {
					rib_rbuf_free(qptoc(wd->qp),
					    SEND_BUFFER,
					    (void *)(uintptr_t)wd->sbufaddr[i]);
				}

				/* decrement the send ref count */
				rib_send_rele(qp);

				mutex_exit(&wd->sendwait_lock);
				(void) rib_free_sendwait(wd);
			}
		}
	}
}
1170 
1171 /*
1172  * RCQ handler
1173  */
/*
 * Client-side receive completion queue (RCQ) handler.
 *
 * For each completed receive, decode the RPC/RDMA header (xid, vers,
 * credits, op), then hand the buffer to the reply waiter matching the
 * xid on qp->replylist.  Buffers with no matching waiter, flushed
 * completions, and error completions are freed here.  The recv_wid
 * and the qp recv reference are always released at the bottom of the
 * loop (the invalid-version path releases them itself and continues).
 */
/* ARGSUSED */
static void
rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct recv_wid	*rwid;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/* wc_id carries the recv_wid posted with the recv WR */
		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
		qp = rwid->qp;

		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op, find_xid = 0;
			struct reply	*r;
			CONN *conn = qptoc(qp);
			uint32_t rdma_credit = 0;

			xdrs = &inxdrs;
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
			    wc.wc_bytes_xfer, XDR_DECODE);
			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)rwid->addr;

			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			(void) xdr_u_int(xdrs, &vers);
			(void) xdr_u_int(xdrs, &rdma_credit);
			(void) xdr_u_int(xdrs, &op);
			XDR_DESTROY(xdrs);

			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version. Cannot
				 * interoperate.  Set connection to
				 * ERROR state and bail out.
				 */
				mutex_enter(&conn->c_lock);
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)rwid->addr);
				rib_free_wid(rwid);
				rib_recv_rele(qp);
				continue;
			}

			/* Hand the buffer to the waiter matching this xid */
			mutex_enter(&qp->replylist_lock);
			for (r = qp->replylist; r != NULL; r = r->next) {
				if (r->xid == xid) {
					find_xid = 1;
					switch (op) {
					case RDMA_MSG:
					case RDMA_NOMSG:
					case RDMA_MSGP:
						r->status = RDMA_SUCCESS;
						r->vaddr_cq = rwid->addr;
						r->bytes_xfer =
						    wc.wc_bytes_xfer;
						cv_signal(&r->wait_cv);
						break;
					default:
						/* unknown op: drop buffer */
						rib_rbuf_free(qptoc(qp),
						    RECV_BUFFER,
						    (void *)(uintptr_t)
						    rwid->addr);
						break;
					}
					break;
				}
			}
			mutex_exit(&qp->replylist_lock);
			if (find_xid == 0) {
				/* RPC caller not waiting for reply */

				DTRACE_PROBE1(rpcib__i__nomatchxid1,
				    int, xid);

				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
				    (void *)(uintptr_t)rwid->addr);
			}
		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
			CONN *conn = qptoc(qp);

			/*
			 * Connection being flushed. Just free
			 * the posted buffer
			 */
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)rwid->addr);
		} else {
			CONN *conn = qptoc(qp);
/*
 *  RC Recv Q Error Code		Local state     Remote State
 *  ====================		===========     ============
 *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
 *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
 *  IBT_WC_WR_FLUSHED_ERR               None            None
 */
			/*
			 * Channel in error state. Set connection
			 * in ERROR state.
			 */
			mutex_enter(&conn->c_lock);
			if (conn->c_state != C_DISCONN_PEND)
				conn->c_state = C_ERROR_CONN;
			mutex_exit(&conn->c_lock);
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)rwid->addr);
		}
		rib_free_wid(rwid);
		rib_recv_rele(qp);
	}
}
1309 
/*
 * Server-side receive completion queue (RCQ) handler.
 *
 * Decodes the RPC/RDMA header of each received message.  RDMA_DONE
 * messages wake the thread waiting on that xid; all other valid
 * messages are wrapped in an mblk and queued to the kRPC master
 * transport via svc_queuereq() — but only while the plugin is in
 * ACCEPT state and the connection is C_CONNECTED (a reference is
 * taken on the conn for the queued request).  Everything else frees
 * the receive buffer.  The svc_recv and qp recv reference are
 * released at the bottom of the loop (early-exit paths release them
 * themselves and continue).
 */
/* ARGSUSED */
static void
rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
{
	rdma_recv_data_t *rdp;
	rib_qp_t	*qp;
	ibt_status_t	ibt_status;
	ibt_wc_t	wc;
	struct svc_recv	*s_recvp;
	CONN		*conn;
	mblk_t		*mp;

	/*
	 * Re-enable cq notify here to avoid missing any
	 * completion queue notification.
	 */
	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);

	ibt_status = IBT_SUCCESS;
	while (ibt_status != IBT_CQ_EMPTY) {
		bzero(&wc, sizeof (wc));
		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
		if (ibt_status != IBT_SUCCESS)
			return;

		/* wc_id carries the svc_recv posted with the recv WR */
		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
		qp = s_recvp->qp;
		conn = qptoc(qp);

		if (wc.wc_status == IBT_WC_SUCCESS) {
			XDR	inxdrs, *xdrs;
			uint_t	xid, vers, op;
			uint32_t rdma_credit;

			xdrs = &inxdrs;
			/* s_recvp->vaddr stores data */
			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
			    wc.wc_bytes_xfer, XDR_DECODE);

			/*
			 * Treat xid as opaque (xid is the first entity
			 * in the rpc rdma message).
			 */
			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
			/* Skip xid and set the xdr position accordingly. */
			XDR_SETPOS(xdrs, sizeof (uint32_t));
			if (!xdr_u_int(xdrs, &vers) ||
			    !xdr_u_int(xdrs, &rdma_credit) ||
			    !xdr_u_int(xdrs, &op)) {
				/* short/garbled header: drop the message */
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				XDR_DESTROY(xdrs);
				rib_recv_rele(qp);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			XDR_DESTROY(xdrs);

			if (vers != RPCRDMA_VERS) {
				/*
				 * Invalid RPC/RDMA version.
				 * Drop rpc rdma message.
				 */
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				rib_recv_rele(qp);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}
			/*
			 * Is this for RDMA_DONE?
			 */
			if (op == RDMA_DONE) {
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
				/*
				 * Wake up the thread waiting on
				 * a RDMA_DONE for xid
				 */
				mutex_enter(&qp->rdlist_lock);
				rdma_done_notify(qp, xid);
				mutex_exit(&qp->rdlist_lock);
				rib_recv_rele(qp);
				(void) rib_free_svc_recv(s_recvp);
				continue;
			}

			mutex_enter(&plugin_state_lock);
			mutex_enter(&conn->c_lock);
			if ((plugin_state == ACCEPT) &&
			    (conn->c_state == C_CONNECTED)) {
				/* hold the conn for the queued request */
				conn->c_ref++;
				mutex_exit(&conn->c_lock);
				/*
				 * NOTE(review): strwaitbuf()'s return value
				 * (nonzero on signal) is ignored, so this
				 * can retry indefinitely — presumably
				 * acceptable in this callback context;
				 * confirm.
				 */
				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
				    == NULL)
					(void) strwaitbuf(
					    sizeof (*rdp), BPRI_LO);
				/*
				 * Plugin is in accept state, hence the master
				 * transport queue for this is still accepting
				 * requests. Hence we can call svc_queuereq to
				 * queue this recieved msg.
				 */
				rdp = (rdma_recv_data_t *)mp->b_rptr;
				rdp->conn = conn;
				rdp->rpcmsg.addr =
				    (caddr_t)(uintptr_t)s_recvp->vaddr;
				rdp->rpcmsg.type = RECV_BUFFER;
				rdp->rpcmsg.len = wc.wc_bytes_xfer;
				rdp->status = wc.wc_status;
				mp->b_wptr += sizeof (*rdp);
				svc_queuereq((queue_t *)rib_stat->q, mp);
				mutex_exit(&plugin_state_lock);
			} else {
				/*
				 * The master transport for this is going
				 * away and the queue is not accepting anymore
				 * requests for krpc, so don't do anything, just
				 * free the msg.
				 */
				mutex_exit(&conn->c_lock);
				mutex_exit(&plugin_state_lock);
				rib_rbuf_free(conn, RECV_BUFFER,
				    (void *)(uintptr_t)s_recvp->vaddr);
			}
		} else {
			/* completion error: just reclaim the buffer */
			rib_rbuf_free(conn, RECV_BUFFER,
			    (void *)(uintptr_t)s_recvp->vaddr);
		}
		rib_recv_rele(qp);
		(void) rib_free_svc_recv(s_recvp);
	}
}
1444 
1445 static void
1446 rib_attach_hca()
1447 {
1448 	mutex_enter(&rib_stat->open_hca_lock);
1449 	(void) rpcib_open_hcas(rib_stat);
1450 	rib_listen(NULL);
1451 	mutex_exit(&rib_stat->open_hca_lock);
1452 }
1453 
1454 /*
1455  * Handles DR event of IBT_HCA_DETACH_EVENT.
1456  */
/*
 * IBTF asynchronous event handler.  Only HCA attach/detach and
 * port-up are acted upon; under DEBUG the remaining event codes are
 * logged for diagnosis and otherwise ignored.
 */
/* ARGSUSED */
static void
rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
	ibt_async_code_t code, ibt_async_event_t *event)
{
	switch (code) {
	case IBT_HCA_ATTACH_EVENT:
		rib_attach_hca();
		break;
	case IBT_HCA_DETACH_EVENT:
		rib_detach_hca(hca_hdl);
#ifdef DEBUG
		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
#endif
		break;
	case IBT_EVENT_PORT_UP:
		/*
		 * A port is up. We should call rib_listen() since there is
		 * a chance that rib_listen() may have failed during
		 * rib_attach_hca() because the port had not been up yet.
		 */
		rib_listen(NULL);
#ifdef DEBUG
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
#endif
		break;
#ifdef DEBUG
	case IBT_EVENT_PATH_MIGRATED:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_EVENT_PATH_MIGRATED\n");
		break;
	case IBT_EVENT_SQD:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
		break;
	case IBT_EVENT_COM_EST:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
		break;
	case IBT_ERROR_CATASTROPHIC_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
		break;
	case IBT_ERROR_INVALID_REQUEST_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
		break;
	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
		break;
	case IBT_ERROR_PATH_MIGRATE_REQ:
		cmn_err(CE_NOTE, "rib_async_handler(): "
		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
		break;
	case IBT_ERROR_CQ:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
		break;
	case IBT_ERROR_PORT_DOWN:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
		break;
	case IBT_ASYNC_OPAQUE1:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
		break;
	case IBT_ASYNC_OPAQUE2:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
		break;
	case IBT_ASYNC_OPAQUE3:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
		break;
	case IBT_ASYNC_OPAQUE4:
		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
		break;
#endif
	default:
		break;
	}
}
1533 
1534 /*
1535  * Client's reachable function.
1536  */
1537 static rdma_stat
1538 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1539 {
1540 	rdma_stat	status;
1541 	rpcib_ping_t	rpt;
1542 	struct netbuf	saddr;
1543 	CONN		*conn;
1544 
1545 	bzero(&saddr, sizeof (struct netbuf));
1546 	status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1547 
1548 	if (status == RDMA_SUCCESS) {
1549 		*handle = (void *)rpt.hca;
1550 		/* release the reference */
1551 		(void) rib_conn_release(conn);
1552 		return (RDMA_SUCCESS);
1553 	} else {
1554 		*handle = NULL;
1555 		DTRACE_PROBE(rpcib__i__pingfailed);
1556 		return (RDMA_FAILED);
1557 	}
1558 }
1559 
1560 /* Client side qp creation */
1561 static rdma_stat
1562 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1563 {
1564 	rib_qp_t	*kqp = NULL;
1565 	CONN		*conn;
1566 	rdma_clnt_cred_ctrl_t *cc_info;
1567 
1568 	ASSERT(qp != NULL);
1569 	*qp = NULL;
1570 
1571 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1572 	conn = qptoc(kqp);
1573 	kqp->hca = hca;
1574 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1575 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1576 
1577 	kqp->mode = RIB_CLIENT;
1578 	kqp->chan_flags = IBT_BLOCKING;
1579 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1580 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1581 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1582 	/*
1583 	 * Initialize
1584 	 */
1585 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1586 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1587 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1588 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1589 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1590 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1591 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1592 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1593 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1594 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1595 	/*
1596 	 * Initialize the client credit control
1597 	 * portion of the rdmaconn struct.
1598 	 */
1599 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1600 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1601 	cc_info->clnt_cc_granted_ops = 0;
1602 	cc_info->clnt_cc_in_flight_ops = 0;
1603 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1604 
1605 	*qp = kqp;
1606 	return (RDMA_SUCCESS);
1607 }
1608 
1609 /* Server side qp creation */
1610 static rdma_stat
1611 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1612 {
1613 	rib_qp_t	*kqp = NULL;
1614 	ibt_chan_sizes_t	chan_sizes;
1615 	ibt_rc_chan_alloc_args_t	qp_attr;
1616 	ibt_status_t		ibt_status;
1617 	rdma_srv_cred_ctrl_t *cc_info;
1618 
1619 	*qp = NULL;
1620 
1621 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1622 	kqp->hca = hca;
1623 	kqp->port_num = port;
1624 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1625 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1626 
1627 	/*
1628 	 * Create the qp handle
1629 	 */
1630 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1631 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1632 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1633 	qp_attr.rc_pd = hca->pd_hdl;
1634 	qp_attr.rc_hca_port_num = port;
1635 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1636 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1637 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1638 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1639 	qp_attr.rc_clone_chan = NULL;
1640 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1641 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1642 
1643 	rw_enter(&hca->state_lock, RW_READER);
1644 	if (hca->state != HCA_DETACHED) {
1645 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1646 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1647 		    &chan_sizes);
1648 	} else {
1649 		rw_exit(&hca->state_lock);
1650 		goto fail;
1651 	}
1652 	rw_exit(&hca->state_lock);
1653 
1654 	if (ibt_status != IBT_SUCCESS) {
1655 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1656 		    int, ibt_status);
1657 		goto fail;
1658 	}
1659 
1660 	kqp->mode = RIB_SERVER;
1661 	kqp->chan_flags = IBT_BLOCKING;
1662 	kqp->q = q;	/* server ONLY */
1663 
1664 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1665 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1666 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1667 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1668 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1669 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1670 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1671 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1672 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1673 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1674 	/*
1675 	 * Set the private data area to qp to be used in callbacks
1676 	 */
1677 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1678 	kqp->rdmaconn.c_state = C_CONNECTED;
1679 
1680 	/*
1681 	 * Initialize the server credit control
1682 	 * portion of the rdmaconn struct.
1683 	 */
1684 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1685 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1686 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1687 	cc_info->srv_cc_cur_buffers_used = 0;
1688 	cc_info->srv_cc_posted = preposted_rbufs;
1689 
1690 	*qp = kqp;
1691 
1692 	return (RDMA_SUCCESS);
1693 fail:
1694 	if (kqp)
1695 		kmem_free(kqp, sizeof (rib_qp_t));
1696 
1697 	return (RDMA_FAILED);
1698 }
1699 
/*
 * Client-side CM (connection manager) event handler, registered via
 * chan_args.oc_cm_handler in rib_conn_to_srv().  Only
 * IBT_CM_EVENT_CONN_CLOSED is acted upon: remote-initiated closes
 * move the connection to C_ERROR_CONN and either tear it down now
 * (c_ref == 0) or defer to the conn cleanup path.  Always accepts.
 */
/* ARGSUSED */
ibt_cm_status_t
rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
    ibt_cm_return_args_t *ret_args, void *priv_data,
    ibt_priv_data_len_t len)
{
	rib_hca_t	*hca;

	hca = (rib_hca_t *)clnt_hdl;

	switch (event->cm_type) {

	/* got a connection close event */
	case IBT_CM_EVENT_CONN_CLOSED:
	{
		CONN	*conn;
		rib_qp_t *qp;

		/* check reason why connection was closed */
		switch (event->cm_event.closed) {
		case IBT_CM_CLOSED_DREP_RCVD:
		case IBT_CM_CLOSED_DREQ_TIMEOUT:
		case IBT_CM_CLOSED_DUP:
		case IBT_CM_CLOSED_ABORT:
		case IBT_CM_CLOSED_ALREADY:
			/*
			 * These cases indicate the local end initiated
			 * the closing of the channel. Nothing to do here.
			 */
			break;
		default:
			/*
			 * Reason for CONN_CLOSED event must be one of
			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
			 * or IBT_CM_CLOSED_STALE. These indicate cases were
			 * the remote end is closing the channel. In these
			 * cases free the channel and transition to error
			 * state
			 */
			qp = ibt_get_chan_private(event->cm_channel);
			conn = qptoc(qp);
			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_DISCONN_PEND) {
				/* teardown already in progress elsewhere */
				mutex_exit(&conn->c_lock);
				break;
			}

			conn->c_state = C_ERROR_CONN;

			/*
			 * Free the conn if c_ref is down to 0 already
			 */
			if (conn->c_ref == 0) {
				/*
				 * Remove from list and free conn
				 */
				conn->c_state = C_DISCONN_PEND;
				mutex_exit(&conn->c_lock);
				rw_enter(&hca->state_lock, RW_READER);
				if (hca->state != HCA_DETACHED)
					(void) rib_disconnect_channel(conn,
					    &hca->cl_conn_list);
				rw_exit(&hca->state_lock);
			} else {
				/*
				 * conn will be freed when c_ref goes to 0.
				 * Indicate to cleaning thread not to close
				 * the connection, but just free the channel.
				 */
				conn->c_flags |= C_CLOSE_NOTNEEDED;
				mutex_exit(&conn->c_lock);
			}
#ifdef DEBUG
			if (rib_debug)
				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
				    "(CONN_CLOSED) channel disconnected");
#endif
			break;
		}
		break;
	}
	default:
		break;
	}
	return (IBT_CM_ACCEPT);
}
1786 
1787 /*
1788  * Connect to the server.
1789  */
1790 rdma_stat
1791 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1792 {
1793 	ibt_chan_open_args_t	chan_args;	/* channel args */
1794 	ibt_chan_sizes_t	chan_sizes;
1795 	ibt_rc_chan_alloc_args_t	qp_attr;
1796 	ibt_status_t		ibt_status;
1797 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1798 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1799 	ibt_ip_cm_info_t	ipcm_info;
1800 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1801 
1802 
1803 	(void) bzero(&chan_args, sizeof (chan_args));
1804 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1805 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1806 
1807 	ipcm_info.src_addr.family = rptp->srcip.family;
1808 	switch (ipcm_info.src_addr.family) {
1809 	case AF_INET:
1810 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1811 		break;
1812 	case AF_INET6:
1813 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1814 		break;
1815 	}
1816 
1817 	ipcm_info.dst_addr.family = rptp->srcip.family;
1818 	switch (ipcm_info.dst_addr.family) {
1819 	case AF_INET:
1820 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1821 		break;
1822 	case AF_INET6:
1823 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1824 		break;
1825 	}
1826 
1827 	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1828 
1829 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1830 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1831 
1832 	if (ibt_status != IBT_SUCCESS) {
1833 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1834 		return (-1);
1835 	}
1836 
1837 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1838 	/* Alloc a RC channel */
1839 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1840 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1841 	qp_attr.rc_pd = hca->pd_hdl;
1842 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1843 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1844 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1845 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1846 	qp_attr.rc_clone_chan = NULL;
1847 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1848 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1849 
1850 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1851 	chan_args.oc_path = &rptp->path;
1852 
1853 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1854 	chan_args.oc_cm_clnt_private = (void *)hca;
1855 	chan_args.oc_rdma_ra_out = 4;
1856 	chan_args.oc_rdma_ra_in = 4;
1857 	chan_args.oc_path_retry_cnt = 2;
1858 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1859 	chan_args.oc_priv_data = cmp_ip_pvt;
1860 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1861 
1862 refresh:
1863 	rw_enter(&hca->state_lock, RW_READER);
1864 	if (hca->state != HCA_DETACHED) {
1865 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1866 		    IBT_ACHAN_NO_FLAGS,
1867 		    &qp_attr, &qp->qp_hdl,
1868 		    &chan_sizes);
1869 	} else {
1870 		rw_exit(&hca->state_lock);
1871 		return (RDMA_FAILED);
1872 	}
1873 	rw_exit(&hca->state_lock);
1874 
1875 	if (ibt_status != IBT_SUCCESS) {
1876 		DTRACE_PROBE1(rpcib__i_conntosrv,
1877 		    int, ibt_status);
1878 		return (RDMA_FAILED);
1879 	}
1880 
1881 	/* Connect to the Server */
1882 	(void) bzero(&ret_args, sizeof (ret_args));
1883 	mutex_enter(&qp->cb_lock);
1884 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1885 	    IBT_BLOCKING, &chan_args, &ret_args);
1886 	if (ibt_status != IBT_SUCCESS) {
1887 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1888 		    int, ibt_status, int, ret_args.rc_status);
1889 
1890 		(void) ibt_free_channel(qp->qp_hdl);
1891 		qp->qp_hdl = NULL;
1892 		mutex_exit(&qp->cb_lock);
1893 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1894 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1895 			/*
1896 			 * Got IBT_CM_CONN_STALE probably because of stale
1897 			 * data on the passive end of a channel that existed
1898 			 * prior to reboot. Retry establishing a channel
1899 			 * REFRESH_ATTEMPTS times, during which time the
1900 			 * stale conditions on the server might clear up.
1901 			 */
1902 			goto refresh;
1903 		}
1904 		return (RDMA_FAILED);
1905 	}
1906 	mutex_exit(&qp->cb_lock);
1907 	/*
1908 	 * Set the private data area to qp to be used in callbacks
1909 	 */
1910 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1911 	return (RDMA_SUCCESS);
1912 }
1913 
/*
 * Find an HCA and a source IP address from which the server address
 * in raddr is reachable, filling in rptp (path, hca, srcip, dstip).
 * Walks every attached HCA and, for each, every local IB IP address
 * of the requested family, asking IBTF for an IP path whose HCA guid
 * matches.  Returns RDMA_SUCCESS with rptp populated, RDMA_INVAL for
 * an unsupported address family, RDMA_FAILED otherwise.
 *
 * NOTE(review): addrs4/addrs6 are uninitialized locals; the cleanup
 * at done2 assumes rpcib_get_ib_addresses() initializes both
 * (ri_size/ri_list) even when it returns failure — confirm.
 */
rdma_stat
rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
{
	uint_t			i, addr_count;
	ibt_status_t		ibt_status;
	uint8_t			num_paths_p;
	ibt_ip_path_attr_t	ipattr;
	ibt_path_ip_src_t	srcip;
	rpcib_ipaddrs_t		addrs4;
	rpcib_ipaddrs_t		addrs6;
	struct sockaddr_in	*sinp;
	struct sockaddr_in6	*sin6p;
	rdma_stat		retval = RDMA_FAILED;
	rib_hca_t *hca;

	if ((addr_type != AF_INET) && (addr_type != AF_INET6))
		return (RDMA_INVAL);
	ASSERT(raddr->buf != NULL);

	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));

	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
		retval = RDMA_FAILED;
		goto done2;
	}

	/* record the destination and pick the candidate source list */
	if (addr_type == AF_INET) {
		addr_count = addrs4.ri_count;
		sinp = (struct sockaddr_in *)raddr->buf;
		rptp->dstip.family = AF_INET;
		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
		sinp = addrs4.ri_list;
	} else {
		addr_count = addrs6.ri_count;
		sin6p = (struct sockaddr_in6 *)raddr->buf;
		rptp->dstip.family = AF_INET6;
		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
		sin6p = addrs6.ri_list;
	}

	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_exit(&hca->state_lock);
			continue;
		}

		ipattr.ipa_dst_ip 	= &rptp->dstip;
		ipattr.ipa_hca_guid	= hca->hca_guid;
		ipattr.ipa_ndst		= 1;
		ipattr.ipa_max_paths	= 1;
		ipattr.ipa_src_ip.family = rptp->dstip.family;
		for (i = 0; i < addr_count; i++) {
			num_paths_p = 0;
			if (addr_type == AF_INET) {
				ipattr.ipa_src_ip.un.ip4addr =
				    sinp[i].sin_addr.s_addr;
			} else {
				ipattr.ipa_src_ip.un.ip6addr =
				    sin6p[i].sin6_addr;
			}
			bzero(&srcip, sizeof (ibt_path_ip_src_t));

			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
			    &num_paths_p, &srcip);
			if (ibt_status == IBT_SUCCESS &&
			    num_paths_p != 0 &&
			    rptp->path.pi_hca_guid == hca->hca_guid) {
				/* found a usable path through this HCA */
				rptp->hca = hca;
				rw_exit(&hca->state_lock);
				if (addr_type == AF_INET) {
					rptp->srcip.family = AF_INET;
					rptp->srcip.un.ip4addr =
					    srcip.ip_primary.un.ip4addr;
				} else {
					rptp->srcip.family = AF_INET6;
					rptp->srcip.un.ip6addr =
					    srcip.ip_primary.un.ip6addr;

				}
				retval = RDMA_SUCCESS;
				goto done1;
			}
		}
		rw_exit(&hca->state_lock);
	}
done1:
	rw_exit(&rib_stat->hcas_list_lock);
done2:
	if (addrs4.ri_size > 0)
		kmem_free(addrs4.ri_list, addrs4.ri_size);
	if (addrs6.ri_size > 0)
		kmem_free(addrs6.ri_list, addrs6.ri_size);
	return (retval);
}
2012 
2013 /*
2014  * Close channel, remove from connection list and
2015  * free up resources allocated for that channel.
2016  */
2017 rdma_stat
2018 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2019 {
2020 	rib_qp_t	*qp = ctoqp(conn);
2021 	rib_hca_t	*hca;
2022 
2023 	mutex_enter(&conn->c_lock);
2024 	if (conn->c_timeout != NULL) {
2025 		mutex_exit(&conn->c_lock);
2026 		(void) untimeout(conn->c_timeout);
2027 		mutex_enter(&conn->c_lock);
2028 	}
2029 
2030 	while (conn->c_flags & C_CLOSE_PENDING) {
2031 		cv_wait(&conn->c_cv, &conn->c_lock);
2032 	}
2033 	mutex_exit(&conn->c_lock);
2034 
2035 	/*
2036 	 * c_ref == 0 and connection is in C_DISCONN_PEND
2037 	 */
2038 	hca = qp->hca;
2039 	if (conn_list != NULL)
2040 		(void) rib_rm_conn(conn, conn_list);
2041 
2042 	/*
2043 	 * There is only one case where we get here with
2044 	 * qp_hdl = NULL, which is during connection setup on
2045 	 * the client. In such a case there are no posted
2046 	 * send/recv buffers.
2047 	 */
2048 	if (qp->qp_hdl != NULL) {
2049 		mutex_enter(&qp->posted_rbufs_lock);
2050 		while (qp->n_posted_rbufs)
2051 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2052 		mutex_exit(&qp->posted_rbufs_lock);
2053 
2054 		mutex_enter(&qp->send_rbufs_lock);
2055 		while (qp->n_send_rbufs)
2056 			cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2057 			mutex_exit(&qp->send_rbufs_lock);
2058 
2059 		(void) ibt_free_channel(qp->qp_hdl);
2060 			qp->qp_hdl = NULL;
2061 	}
2062 
2063 	ASSERT(qp->rdlist == NULL);
2064 
2065 	if (qp->replylist != NULL) {
2066 		(void) rib_rem_replylist(qp);
2067 	}
2068 
2069 	cv_destroy(&qp->cb_conn_cv);
2070 	cv_destroy(&qp->posted_rbufs_cv);
2071 	cv_destroy(&qp->send_rbufs_cv);
2072 	mutex_destroy(&qp->cb_lock);
2073 	mutex_destroy(&qp->replylist_lock);
2074 	mutex_destroy(&qp->posted_rbufs_lock);
2075 	mutex_destroy(&qp->send_rbufs_lock);
2076 	mutex_destroy(&qp->rdlist_lock);
2077 
2078 	cv_destroy(&conn->c_cv);
2079 	mutex_destroy(&conn->c_lock);
2080 
2081 	if (conn->c_raddr.buf != NULL) {
2082 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2083 	}
2084 	if (conn->c_laddr.buf != NULL) {
2085 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2086 	}
2087 	if (conn->c_netid != NULL) {
2088 		kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2089 	}
2090 	if (conn->c_addrmask.buf != NULL) {
2091 		kmem_free(conn->c_addrmask.buf, conn->c_addrmask.len);
2092 	}
2093 
2094 	/*
2095 	 * Credit control cleanup.
2096 	 */
2097 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2098 		rdma_clnt_cred_ctrl_t *cc_info;
2099 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2100 		cv_destroy(&cc_info->clnt_cc_cv);
2101 	}
2102 
2103 	kmem_free(qp, sizeof (rib_qp_t));
2104 
2105 	/*
2106 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2107 	 * then the hca is no longer being used.
2108 	 */
2109 	if (conn_list != NULL) {
2110 		rw_enter(&hca->state_lock, RW_READER);
2111 		if (hca->state == HCA_DETACHED) {
2112 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2113 			if (hca->srv_conn_list.conn_hd == NULL) {
2114 				rw_enter(&hca->cl_conn_list.conn_lock,
2115 				    RW_READER);
2116 
2117 				if (hca->cl_conn_list.conn_hd == NULL) {
2118 					mutex_enter(&hca->inuse_lock);
2119 					hca->inuse = FALSE;
2120 					cv_signal(&hca->cb_cv);
2121 					mutex_exit(&hca->inuse_lock);
2122 				}
2123 				rw_exit(&hca->cl_conn_list.conn_lock);
2124 			}
2125 			rw_exit(&hca->srv_conn_list.conn_lock);
2126 		}
2127 		rw_exit(&hca->state_lock);
2128 	}
2129 
2130 	return (RDMA_SUCCESS);
2131 }
2132 
2133 /*
2134  * All sends are done under the protection of
2135  * the wdesc->sendwait_lock. n_send_rbufs count
2136  * is protected using the send_rbufs_lock.
2137  * lock ordering is:
2138  * sendwait_lock -> send_rbufs_lock
2139  */
2140 
2141 void
2142 rib_send_hold(rib_qp_t *qp)
2143 {
2144 	mutex_enter(&qp->send_rbufs_lock);
2145 	qp->n_send_rbufs++;
2146 	mutex_exit(&qp->send_rbufs_lock);
2147 }
2148 
2149 void
2150 rib_send_rele(rib_qp_t *qp)
2151 {
2152 	mutex_enter(&qp->send_rbufs_lock);
2153 	qp->n_send_rbufs--;
2154 	if (qp->n_send_rbufs == 0)
2155 		cv_signal(&qp->send_rbufs_cv);
2156 	mutex_exit(&qp->send_rbufs_lock);
2157 }
2158 
2159 void
2160 rib_recv_rele(rib_qp_t *qp)
2161 {
2162 	mutex_enter(&qp->posted_rbufs_lock);
2163 	qp->n_posted_rbufs--;
2164 	if (qp->n_posted_rbufs == 0)
2165 		cv_signal(&qp->posted_rbufs_cv);
2166 	mutex_exit(&qp->posted_rbufs_lock);
2167 }
2168 
2169 /*
2170  * Wait for send completion notification. Only on receiving a
2171  * notification be it a successful or error completion, free the
2172  * send_wid.
2173  */
static rdma_stat
rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
{
	clock_t timout, cv_wait_ret;
	rdma_stat error = RDMA_SUCCESS;
	int	i;

	/*
	 * Wait for send to complete
	 */
	ASSERT(wd != NULL);
	mutex_enter(&wd->sendwait_lock);
	if (wd->status == (uint_t)SEND_WAIT) {
		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
		    ddi_get_lbolt();

		if (qp->mode == RIB_SERVER) {
			/* Server side: wait is not signal-interruptible. */
			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
			    &wd->sendwait_lock, timout)) > 0 &&
			    wd->status == (uint_t)SEND_WAIT)
				;
			switch (cv_wait_ret) {
			case -1:	/* timeout */
				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);

				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_TIMEDOUT;
				break;
			default:	/* got send completion */
				break;
			}
		} else {
			/* Client side: wait may be interrupted by a signal. */
			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
			    &wd->sendwait_lock, timout)) > 0 &&
			    wd->status == (uint_t)SEND_WAIT)
				;
			switch (cv_wait_ret) {
			case -1:	/* timeout */
				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);

				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_TIMEDOUT;
				break;
			case 0:		/* interrupted */
				DTRACE_PROBE(rpcib__i__clntsendwait__intr);

				wd->cv_sig = 0;		/* no signal needed */
				error = RDMA_INTR;
				break;
			default:	/* got send completion */
				break;
			}
		}
	}

	if (wd->status != (uint_t)SEND_WAIT) {
		/* got send completion */
		if (wd->status != RDMA_SUCCESS) {
			/* Map the completion status to an rdma_stat error. */
			switch (wd->status) {
			case RDMA_CONNLOST:
				error = RDMA_CONNLOST;
				break;
			default:
				error = RDMA_FAILED;
				break;
			}
		}
		/* Free the send buffers recorded in the wid at post time. */
		for (i = 0; i < wd->nsbufs; i++) {
			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
			    (void *)(uintptr_t)wd->sbufaddr[i]);
		}

		rib_send_rele(qp);

		mutex_exit(&wd->sendwait_lock);
		(void) rib_free_sendwait(wd);

	} else {
		/* Still pending; scq_handler will free buffers and wid. */
		mutex_exit(&wd->sendwait_lock);
	}
	return (error);
}
2256 
2257 static struct send_wid *
2258 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2259 {
2260 	struct send_wid	*wd;
2261 
2262 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2263 	wd->xid = xid;
2264 	wd->cv_sig = cv_sig;
2265 	wd->qp = qp;
2266 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2267 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2268 	wd->status = (uint_t)SEND_WAIT;
2269 
2270 	return (wd);
2271 }
2272 
2273 static int
2274 rib_free_sendwait(struct send_wid *wdesc)
2275 {
2276 	cv_destroy(&wdesc->wait_cv);
2277 	mutex_destroy(&wdesc->sendwait_lock);
2278 	kmem_free(wdesc, sizeof (*wdesc));
2279 
2280 	return (0);
2281 }
2282 
2283 static rdma_stat
2284 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2285 {
2286 	mutex_enter(&qp->replylist_lock);
2287 	if (rep != NULL) {
2288 		(void) rib_remreply(qp, rep);
2289 		mutex_exit(&qp->replylist_lock);
2290 		return (RDMA_SUCCESS);
2291 	}
2292 	mutex_exit(&qp->replylist_lock);
2293 	return (RDMA_FAILED);
2294 }
2295 
2296 /*
2297  * Send buffers are freed here only in case of error in posting
2298  * on QP. If the post succeeded, the send buffers are freed upon
2299  * send completion in rib_sendwait() or in the scq_handler.
2300  */
rdma_stat
rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
	int send_sig, int cv_sig, caddr_t *swid)
{
	struct send_wid	*wdesc;
	struct clist	*clp;
	ibt_status_t	ibt_status = IBT_SUCCESS;
	rdma_stat	ret = RDMA_SUCCESS;
	ibt_send_wr_t	tx_wr;
	int		i, nds;
	ibt_wr_ds_t	sgl[DSEG_MAX];
	uint_t		total_msg_size;
	rib_qp_t	*qp;

	qp = ctoqp(conn);

	ASSERT(cl != NULL);

	bzero(&tx_wr, sizeof (ibt_send_wr_t));

	/* Build the scatter/gather list from the clist chain. */
	nds = 0;
	total_msg_size = 0;
	clp = cl;
	while (clp != NULL) {
		if (nds >= DSEG_MAX) {
			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
			return (RDMA_FAILED);
		}
		sgl[nds].ds_va = clp->w.c_saddr;
		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
		sgl[nds].ds_len = clp->c_len;
		total_msg_size += clp->c_len;
		clp = clp->c_next;
		nds++;
	}

	if (send_sig) {
		/* Set SEND_SIGNAL flag. */
		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
		*swid = (caddr_t)wdesc;
		tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
		/* Hold sendwait_lock until the post outcome is known. */
		mutex_enter(&wdesc->sendwait_lock);
		wdesc->nsbufs = nds;
		for (i = 0; i < nds; i++) {
			wdesc->sbufaddr[i] = sgl[i].ds_va;
		}
	} else {
		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
		*swid = NULL;
		tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
	}

	tx_wr.wr_opcode = IBT_WRC_SEND;
	tx_wr.wr_trans = IBT_RC_SRV;
	tx_wr.wr_nds = nds;
	tx_wr.wr_sgl = sgl;

	mutex_enter(&conn->c_lock);
	if (conn->c_state == C_CONNECTED) {
		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
	}
	if (conn->c_state != C_CONNECTED ||
	    ibt_status != IBT_SUCCESS) {
		/* Post failed or connection lost; free buffers here. */
		if (conn->c_state != C_DISCONN_PEND)
			conn->c_state = C_ERROR_CONN;
		mutex_exit(&conn->c_lock);
		if (send_sig) {
			for (i = 0; i < nds; i++) {
				rib_rbuf_free(conn, SEND_BUFFER,
				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
			}
			mutex_exit(&wdesc->sendwait_lock);
			(void) rib_free_sendwait(wdesc);
		}
		return (RDMA_CONNLOST);
	}

	mutex_exit(&conn->c_lock);

	if (send_sig) {
		/* Post succeeded; count the outstanding send. */
		rib_send_hold(qp);
		mutex_exit(&wdesc->sendwait_lock);
		if (cv_sig) {
			/*
			 * cv_wait for send to complete.
			 * We can fail due to a timeout or signal or
			 * unsuccessful send.
			 */
			ret = rib_sendwait(qp, wdesc);

			return (ret);
		}
	}

	return (RDMA_SUCCESS);
}
2398 
2399 
2400 rdma_stat
2401 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2402 {
2403 	rdma_stat	ret;
2404 	caddr_t		wd;
2405 
2406 	/* send-wait & cv_signal */
2407 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2408 	return (ret);
2409 }
2410 
2411 /*
2412  * Deprecated/obsolete interface not used currently
2413  * but earlier used for READ-READ protocol.
2414  * Send RPC reply and wait for RDMA_DONE.
2415  */
2416 rdma_stat
2417 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2418 {
2419 	rdma_stat ret = RDMA_SUCCESS;
2420 	struct rdma_done_list *rd;
2421 	clock_t cv_wait_ret;
2422 	caddr_t *wid = NULL;
2423 	rib_qp_t *qp = ctoqp(conn);
2424 
2425 	mutex_enter(&qp->rdlist_lock);
2426 	rd = rdma_done_add(qp, msgid);
2427 
2428 	/* No cv_signal (whether send-wait or no-send-wait) */
2429 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2430 
2431 	if (ret != RDMA_SUCCESS) {
2432 		rdma_done_rm(qp, rd);
2433 	} else {
2434 		/*
2435 		 * Wait for RDMA_DONE from remote end
2436 		 */
2437 		cv_wait_ret = cv_reltimedwait(&rd->rdma_done_cv,
2438 		    &qp->rdlist_lock, drv_usectohz(REPLY_WAIT_TIME * 1000000),
2439 		    TR_CLOCK_TICK);
2440 
2441 		rdma_done_rm(qp, rd);
2442 
2443 		if (cv_wait_ret < 0) {
2444 			ret = RDMA_TIMEDOUT;
2445 		}
2446 	}
2447 
2448 	mutex_exit(&qp->rdlist_lock);
2449 	return (ret);
2450 }
2451 
2452 static struct recv_wid *
2453 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2454 {
2455 	struct recv_wid	*rwid;
2456 
2457 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2458 	rwid->xid = msgid;
2459 	rwid->addr = sgl->ds_va;
2460 	rwid->qp = qp;
2461 
2462 	return (rwid);
2463 }
2464 
2465 static void
2466 rib_free_wid(struct recv_wid *rwid)
2467 {
2468 	kmem_free(rwid, sizeof (struct recv_wid));
2469 }
2470 
2471 rdma_stat
2472 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2473 {
2474 	rib_qp_t	*qp = ctoqp(conn);
2475 	struct clist	*clp = cl;
2476 	struct reply	*rep;
2477 	struct recv_wid	*rwid;
2478 	int		nds;
2479 	ibt_wr_ds_t	sgl[DSEG_MAX];
2480 	ibt_recv_wr_t	recv_wr;
2481 	rdma_stat	ret;
2482 	ibt_status_t	ibt_status;
2483 
2484 	/*
2485 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2486 	 */
2487 
2488 	nds = 0;
2489 	while (cl != NULL) {
2490 		if (nds >= DSEG_MAX) {
2491 			ret = RDMA_FAILED;
2492 			goto done;
2493 		}
2494 		sgl[nds].ds_va = cl->w.c_saddr;
2495 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2496 		sgl[nds].ds_len = cl->c_len;
2497 		cl = cl->c_next;
2498 		nds++;
2499 	}
2500 
2501 	if (nds != 1) {
2502 		ret = RDMA_FAILED;
2503 		goto done;
2504 	}
2505 
2506 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2507 	recv_wr.wr_nds = nds;
2508 	recv_wr.wr_sgl = sgl;
2509 
2510 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2511 	if (rwid) {
2512 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2513 	} else {
2514 		ret = RDMA_NORESOURCE;
2515 		goto done;
2516 	}
2517 	rep = rib_addreplylist(qp, msgid);
2518 	if (!rep) {
2519 		rib_free_wid(rwid);
2520 		ret = RDMA_NORESOURCE;
2521 		goto done;
2522 	}
2523 
2524 	mutex_enter(&conn->c_lock);
2525 
2526 	if (conn->c_state == C_CONNECTED) {
2527 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2528 	}
2529 
2530 	if (conn->c_state != C_CONNECTED ||
2531 	    ibt_status != IBT_SUCCESS) {
2532 		if (conn->c_state != C_DISCONN_PEND)
2533 			conn->c_state = C_ERROR_CONN;
2534 		mutex_exit(&conn->c_lock);
2535 		rib_free_wid(rwid);
2536 		(void) rib_rem_rep(qp, rep);
2537 		ret = RDMA_CONNLOST;
2538 		goto done;
2539 	}
2540 
2541 	mutex_enter(&qp->posted_rbufs_lock);
2542 	qp->n_posted_rbufs++;
2543 	mutex_exit(&qp->posted_rbufs_lock);
2544 
2545 	mutex_exit(&conn->c_lock);
2546 	return (RDMA_SUCCESS);
2547 
2548 done:
2549 	while (clp != NULL) {
2550 		rib_rbuf_free(conn, RECV_BUFFER,
2551 		    (void *)(uintptr_t)clp->w.c_saddr3);
2552 		clp = clp->c_next;
2553 	}
2554 	return (ret);
2555 }
2556 
2557 rdma_stat
2558 rib_svc_post(CONN* conn, struct clist *cl)
2559 {
2560 	rib_qp_t	*qp = ctoqp(conn);
2561 	struct svc_recv	*s_recvp;
2562 	int		nds;
2563 	ibt_wr_ds_t	sgl[DSEG_MAX];
2564 	ibt_recv_wr_t	recv_wr;
2565 	ibt_status_t	ibt_status;
2566 
2567 	nds = 0;
2568 	while (cl != NULL) {
2569 		if (nds >= DSEG_MAX) {
2570 			return (RDMA_FAILED);
2571 		}
2572 		sgl[nds].ds_va = cl->w.c_saddr;
2573 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2574 		sgl[nds].ds_len = cl->c_len;
2575 		cl = cl->c_next;
2576 		nds++;
2577 	}
2578 
2579 	if (nds != 1) {
2580 		rib_rbuf_free(conn, RECV_BUFFER,
2581 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2582 
2583 		return (RDMA_FAILED);
2584 	}
2585 
2586 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2587 	recv_wr.wr_nds = nds;
2588 	recv_wr.wr_sgl = sgl;
2589 
2590 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2591 	/* Use s_recvp's addr as wr id */
2592 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2593 	mutex_enter(&conn->c_lock);
2594 	if (conn->c_state == C_CONNECTED) {
2595 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2596 	}
2597 	if (conn->c_state != C_CONNECTED ||
2598 	    ibt_status != IBT_SUCCESS) {
2599 		if (conn->c_state != C_DISCONN_PEND)
2600 			conn->c_state = C_ERROR_CONN;
2601 		mutex_exit(&conn->c_lock);
2602 		rib_rbuf_free(conn, RECV_BUFFER,
2603 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2604 		(void) rib_free_svc_recv(s_recvp);
2605 
2606 		return (RDMA_CONNLOST);
2607 	}
2608 	mutex_exit(&conn->c_lock);
2609 
2610 	return (RDMA_SUCCESS);
2611 }
2612 
2613 /* Client */
2614 rdma_stat
2615 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2616 {
2617 	return (rib_clnt_post(conn, cl, msgid));
2618 }
2619 
2620 /* Client */
2621 rdma_stat
2622 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2623 {
2624 	rib_qp_t	*qp = ctoqp(conn);
2625 	struct reply	*rep;
2626 
2627 	mutex_enter(&qp->replylist_lock);
2628 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2629 		if (rep->xid == msgid) {
2630 			if (rep->vaddr_cq) {
2631 				rib_rbuf_free(conn, RECV_BUFFER,
2632 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2633 			}
2634 			(void) rib_remreply(qp, rep);
2635 			break;
2636 		}
2637 	}
2638 	mutex_exit(&qp->replylist_lock);
2639 
2640 	return (RDMA_SUCCESS);
2641 }
2642 
2643 /* Server */
2644 rdma_stat
2645 rib_post_recv(CONN *conn, struct clist *cl)
2646 {
2647 	rib_qp_t	*qp = ctoqp(conn);
2648 
2649 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2650 		mutex_enter(&qp->posted_rbufs_lock);
2651 		qp->n_posted_rbufs++;
2652 		mutex_exit(&qp->posted_rbufs_lock);
2653 		return (RDMA_SUCCESS);
2654 	}
2655 	return (RDMA_FAILED);
2656 }
2657 
2658 /*
2659  * Client side only interface to "recv" the rpc reply buf
2660  * posted earlier by rib_post_resp(conn, cl, msgid).
2661  */
rdma_stat
rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
{
	struct reply *rep = NULL;
	clock_t timout, cv_wait_ret;
	rdma_stat ret = RDMA_SUCCESS;
	rib_qp_t *qp = ctoqp(conn);

	/*
	 * Find the reply structure for this msgid
	 */
	mutex_enter(&qp->replylist_lock);

	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
		if (rep->xid == msgid)
			break;
	}

	if (rep != NULL) {
		/*
		 * If message not yet received, wait.
		 */
		if (rep->status == (uint_t)REPLY_WAIT) {
			timout = ddi_get_lbolt() +
			    drv_usectohz(REPLY_WAIT_TIME * 1000000);

			/* Interruptible wait; re-check status on wakeup. */
			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
			    &qp->replylist_lock, timout)) > 0 &&
			    rep->status == (uint_t)REPLY_WAIT)
				;

			switch (cv_wait_ret) {
			case -1:	/* timeout */
				ret = RDMA_TIMEDOUT;
				break;
			case 0:
				/* interrupted by a signal */
				ret = RDMA_INTR;
				break;
			default:
				break;
			}
		}

		if (rep->status == RDMA_SUCCESS) {
			struct clist *cl = NULL;

			/*
			 * Got message successfully
			 */
			clist_add(&cl, 0, rep->bytes_xfer, NULL,
			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
			*clp = cl;
		} else {
			if (rep->status != (uint_t)REPLY_WAIT) {
				/*
				 * Got error in reply message. Free
				 * recv buffer here.
				 */
				ret = rep->status;
				rib_rbuf_free(conn, RECV_BUFFER,
				    (caddr_t)(uintptr_t)rep->vaddr_cq);
			}
		}
		/* Done with this reply entry either way; unlink it. */
		(void) rib_remreply(qp, rep);
	} else {
		/*
		 * No matching reply structure found for given msgid on the
		 * reply wait list.
		 */
		ret = RDMA_INVAL;
		DTRACE_PROBE(rpcib__i__nomatchxid2);
	}

	/*
	 * Done.
	 */
	mutex_exit(&qp->replylist_lock);
	return (ret);
}
2741 
2742 /*
2743  * RDMA write a buffer to the remote address.
2744  */
rdma_stat
rib_write(CONN *conn, struct clist *cl, int wait)
{
	ibt_send_wr_t	tx_wr;
	int		cv_sig;
	ibt_wr_ds_t	sgl[DSEG_MAX];
	struct send_wid	*wdesc;
	ibt_status_t	ibt_status;
	rdma_stat	ret = RDMA_SUCCESS;
	rib_qp_t	*qp = ctoqp(conn);
	uint64_t	n_writes = 0;

	if (cl == NULL) {
		return (RDMA_FAILED);
	}

	/* One RDMA write work request per non-empty clist entry. */
	while ((cl != NULL)) {
		if (cl->c_len > 0) {
			bzero(&tx_wr, sizeof (ibt_send_wr_t));
			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
			    cl->c_dmemhandle.mrc_rmr; /* rkey */
			sgl[0].ds_va = cl->w.c_saddr;
			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
			sgl[0].ds_len = cl->c_len;

			/*
			 * Request a signaled completion either when the
			 * caller asked to wait, or periodically after
			 * max_unsignaled_rws unsignaled writes so the
			 * send queue does not fill with unreaped WRs.
			 */
			if (wait) {
				cv_sig = 1;
			} else {
				if (n_writes > max_unsignaled_rws) {
					n_writes = 0;
					cv_sig = 1;
				} else {
					cv_sig = 0;
				}
			}

			if (cv_sig) {
				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
				wdesc = rib_init_sendwait(0, cv_sig, qp);
				tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
				mutex_enter(&wdesc->sendwait_lock);
			} else {
				tx_wr.wr_flags = IBT_WR_NO_FLAGS;
				tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
			}
			tx_wr.wr_opcode = IBT_WRC_RDMAW;
			tx_wr.wr_trans = IBT_RC_SRV;
			tx_wr.wr_nds = 1;
			tx_wr.wr_sgl = sgl;

			mutex_enter(&conn->c_lock);
			if (conn->c_state == C_CONNECTED) {
				ibt_status =
				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
			}
			if (conn->c_state != C_CONNECTED ||
			    ibt_status != IBT_SUCCESS) {
				/* Post failed or connection lost. */
				if (conn->c_state != C_DISCONN_PEND)
					conn->c_state = C_ERROR_CONN;
				mutex_exit(&conn->c_lock);
				if (cv_sig) {
					mutex_exit(&wdesc->sendwait_lock);
					(void) rib_free_sendwait(wdesc);
				}
				return (RDMA_CONNLOST);
			}

			mutex_exit(&conn->c_lock);

			/*
			 * Wait for send to complete
			 */
			if (cv_sig) {

				rib_send_hold(qp);
				mutex_exit(&wdesc->sendwait_lock);

				ret = rib_sendwait(qp, wdesc);
				if (ret != 0)
					return (ret);
			}
			n_writes ++;
		}
		cl = cl->c_next;
	}
	return (RDMA_SUCCESS);
}
2833 
2834 /*
2835  * RDMA Read a buffer from the remote address.
2836  */
rdma_stat
rib_read(CONN *conn, struct clist *cl, int wait)
{
	ibt_send_wr_t	rx_wr;
	int		cv_sig = 0;
	ibt_wr_ds_t	sgl;
	struct send_wid	*wdesc;
	ibt_status_t	ibt_status = IBT_SUCCESS;
	rdma_stat	ret = RDMA_SUCCESS;
	rib_qp_t	*qp = ctoqp(conn);

	if (cl == NULL) {
		return (RDMA_FAILED);
	}

	/* One RDMA read work request per clist entry. */
	while (cl != NULL) {
		bzero(&rx_wr, sizeof (ibt_send_wr_t));
		/*
		 * Remote address is at the head chunk item in list.
		 */
		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;

		sgl.ds_va = cl->u.c_daddr;
		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
		sgl.ds_len = cl->c_len;

		/*
		 * If there are multiple chunks to be read, and
		 * wait is set, ask for signal only for the last chunk
		 * and wait only on the last chunk. The completion of
		 * RDMA_READ on last chunk ensures that reads on all
		 * previous chunks are also completed.
		 */
		if (wait && (cl->c_next == NULL)) {
			cv_sig = 1;
			wdesc = rib_init_sendwait(0, cv_sig, qp);
			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
			rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
			mutex_enter(&wdesc->sendwait_lock);
		} else {
			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
			rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
		}
		rx_wr.wr_opcode = IBT_WRC_RDMAR;
		rx_wr.wr_trans = IBT_RC_SRV;
		rx_wr.wr_nds = 1;
		rx_wr.wr_sgl = &sgl;

		mutex_enter(&conn->c_lock);
		if (conn->c_state == C_CONNECTED) {
			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
		}
		if (conn->c_state != C_CONNECTED ||
		    ibt_status != IBT_SUCCESS) {
			/* Post failed or connection lost. */
			if (conn->c_state != C_DISCONN_PEND)
				conn->c_state = C_ERROR_CONN;
			mutex_exit(&conn->c_lock);
			if (wait && (cl->c_next == NULL)) {
				mutex_exit(&wdesc->sendwait_lock);
				(void) rib_free_sendwait(wdesc);
			}
			return (RDMA_CONNLOST);
		}

		mutex_exit(&conn->c_lock);

		/*
		 * Wait for send to complete if this is the
		 * last item in the list.
		 */
		if (wait && cl->c_next == NULL) {
			rib_send_hold(qp);
			mutex_exit(&wdesc->sendwait_lock);

			ret = rib_sendwait(qp, wdesc);

			if (ret != 0)
				return (ret);
		}
		cl = cl->c_next;
	}
	return (RDMA_SUCCESS);
}
2921 
2922 /*
2923  * rib_srv_cm_handler()
2924  *    Connection Manager callback to handle RC connection requests.
2925  */
2926 /* ARGSUSED */
2927 static ibt_cm_status_t
2928 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2929 	ibt_cm_return_args_t *ret_args, void *priv_data,
2930 	ibt_priv_data_len_t len)
2931 {
2932 	queue_t		*q;
2933 	rib_qp_t	*qp;
2934 	rib_hca_t	*hca;
2935 	rdma_stat	status = RDMA_SUCCESS;
2936 	int		i;
2937 	struct clist	cl;
2938 	rdma_buf_t	rdbuf = {0};
2939 	void		*buf = NULL;
2940 	CONN		*conn;
2941 	ibt_ip_cm_info_t	ipinfo;
2942 	struct sockaddr_in *s;
2943 	struct sockaddr_in6 *s6;
2944 	int sin_size = sizeof (struct sockaddr_in);
2945 	int in_size = sizeof (struct in_addr);
2946 	int sin6_size = sizeof (struct sockaddr_in6);
2947 
2948 	ASSERT(any != NULL);
2949 	ASSERT(event != NULL);
2950 
2951 	hca = (rib_hca_t *)any;
2952 
2953 	/* got a connection request */
2954 	switch (event->cm_type) {
2955 	case IBT_CM_EVENT_REQ_RCV:
2956 		/*
2957 		 * If the plugin is in the NO_ACCEPT state, bail out.
2958 		 */
2959 		mutex_enter(&plugin_state_lock);
2960 		if (plugin_state == NO_ACCEPT) {
2961 			mutex_exit(&plugin_state_lock);
2962 			return (IBT_CM_REJECT);
2963 		}
2964 		mutex_exit(&plugin_state_lock);
2965 
2966 		/*
2967 		 * Need to send a MRA MAD to CM so that it does not
2968 		 * timeout on us.
2969 		 */
2970 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2971 		    event->cm_event.req.req_timeout * 8, NULL, 0);
2972 
2973 		mutex_enter(&rib_stat->open_hca_lock);
2974 		q = rib_stat->q;
2975 		mutex_exit(&rib_stat->open_hca_lock);
2976 
2977 		status = rib_svc_create_chan(hca, (caddr_t)q,
2978 		    event->cm_event.req.req_prim_hca_port, &qp);
2979 
2980 		if (status) {
2981 			return (IBT_CM_REJECT);
2982 		}
2983 
2984 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2985 		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2986 		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2987 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2988 
2989 		/*
2990 		 * Pre-posts RECV buffers
2991 		 */
2992 		conn = qptoc(qp);
2993 		for (i = 0; i < preposted_rbufs; i++) {
2994 			bzero(&rdbuf, sizeof (rdbuf));
2995 			rdbuf.type = RECV_BUFFER;
2996 			buf = rib_rbuf_alloc(conn, &rdbuf);
2997 			if (buf == NULL) {
2998 				/*
2999 				 * A connection is not established yet.
3000 				 * Just flush the channel. Buffers
3001 				 * posted till now will error out with
3002 				 * IBT_WC_WR_FLUSHED_ERR.
3003 				 */
3004 				(void) ibt_flush_channel(qp->qp_hdl);
3005 				(void) rib_disconnect_channel(conn, NULL);
3006 				return (IBT_CM_REJECT);
3007 			}
3008 
3009 			bzero(&cl, sizeof (cl));
3010 			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
3011 			cl.c_len = rdbuf.len;
3012 			cl.c_smemhandle.mrc_lmr =
3013 			    rdbuf.handle.mrc_lmr; /* lkey */
3014 			cl.c_next = NULL;
3015 			status = rib_post_recv(conn, &cl);
3016 			if (status != RDMA_SUCCESS) {
3017 				/*
3018 				 * A connection is not established yet.
3019 				 * Just flush the channel. Buffers
3020 				 * posted till now will error out with
3021 				 * IBT_WC_WR_FLUSHED_ERR.
3022 				 */
3023 				(void) ibt_flush_channel(qp->qp_hdl);
3024 				(void) rib_disconnect_channel(conn, NULL);
3025 				return (IBT_CM_REJECT);
3026 			}
3027 		}
3028 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
3029 
3030 		/*
3031 		 * Get the address translation
3032 		 */
3033 		rw_enter(&hca->state_lock, RW_READER);
3034 		if (hca->state == HCA_DETACHED) {
3035 			rw_exit(&hca->state_lock);
3036 			return (IBT_CM_REJECT);
3037 		}
3038 		rw_exit(&hca->state_lock);
3039 
3040 		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
3041 
3042 		if (ibt_get_ip_data(event->cm_priv_data_len,
3043 		    event->cm_priv_data,
3044 		    &ipinfo) != IBT_SUCCESS) {
3045 
3046 			return (IBT_CM_REJECT);
3047 		}
3048 
3049 		switch (ipinfo.src_addr.family) {
3050 		case AF_INET:
3051 
3052 			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
3053 			    KM_SLEEP);
3054 			(void) strcpy(conn->c_netid, RIBNETID_TCP);
3055 
3056 			conn->c_raddr.maxlen =
3057 			    conn->c_raddr.len = sin_size;
3058 			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3059 
3060 			s = (struct sockaddr_in *)conn->c_raddr.buf;
3061 			s->sin_family = AF_INET;
3062 			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
3063 			    &s->sin_addr, in_size);
3064 
3065 			conn->c_laddr.maxlen =
3066 			    conn->c_laddr.len = sin_size;
3067 			conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3068 
3069 			s = (struct sockaddr_in *)conn->c_laddr.buf;
3070 			s->sin_family = AF_INET;
3071 			bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
3072 			    &s->sin_addr, in_size);
3073 
3074 			conn->c_addrmask.maxlen = conn->c_addrmask.len =
3075 			    sizeof (struct sockaddr_in);
3076 			conn->c_addrmask.buf =
3077 			    kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3078 			((struct sockaddr_in *)
3079 			    conn->c_addrmask.buf)->sin_addr.s_addr =
3080 			    (uint32_t)~0;
3081 			((struct sockaddr_in *)
3082 			    conn->c_addrmask.buf)->sin_family =
3083 			    (sa_family_t)~0;
3084 			break;
3085 
3086 		case AF_INET6:
3087 
3088 			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
3089 			    KM_SLEEP);
3090 			(void) strcpy(conn->c_netid, RIBNETID_TCP6);
3091 
3092 			conn->c_raddr.maxlen =
3093 			    conn->c_raddr.len = sin6_size;
3094 			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3095 
3096 			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3097 			s6->sin6_family = AF_INET6;
3098 			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
3099 			    &s6->sin6_addr,
3100 			    sizeof (struct in6_addr));
3101 
3102 			conn->c_laddr.maxlen =
3103 			    conn->c_laddr.len = sin6_size;
3104 			conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3105 
3106 			s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
3107 			s6->sin6_family = AF_INET6;
3108 			bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
3109 			    &s6->sin6_addr,
3110 			    sizeof (struct in6_addr));
3111 
3112 			conn->c_addrmask.maxlen = conn->c_addrmask.len =
3113 			    sizeof (struct sockaddr_in6);
3114 			conn->c_addrmask.buf =
3115 			    kmem_zalloc(conn->c_addrmask.len, KM_SLEEP);
3116 			(void) memset(&((struct sockaddr_in6 *)
3117 			    conn->c_addrmask.buf)->sin6_addr, (uchar_t)~0,
3118 			    sizeof (struct in6_addr));
3119 			((struct sockaddr_in6 *)
3120 			    conn->c_addrmask.buf)->sin6_family =
3121 			    (sa_family_t)~0;
3122 			break;
3123 
3124 		default:
3125 			return (IBT_CM_REJECT);
3126 		}
3127 
3128 		break;
3129 
3130 	case IBT_CM_EVENT_CONN_CLOSED:
3131 	{
3132 		CONN		*conn;
3133 		rib_qp_t	*qp;
3134 
3135 		switch (event->cm_event.closed) {
3136 		case IBT_CM_CLOSED_DREP_RCVD:
3137 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
3138 		case IBT_CM_CLOSED_DUP:
3139 		case IBT_CM_CLOSED_ABORT:
3140 		case IBT_CM_CLOSED_ALREADY:
3141 			/*
3142 			 * These cases indicate the local end initiated
3143 			 * the closing of the channel. Nothing to do here.
3144 			 */
3145 			break;
3146 		default:
3147 			/*
3148 			 * Reason for CONN_CLOSED event must be one of
3149 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
3150 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
3151 			 * the remote end is closing the channel. In these
3152 			 * cases free the channel and transition to error
3153 			 * state
3154 			 */
3155 			qp = ibt_get_chan_private(event->cm_channel);
3156 			conn = qptoc(qp);
3157 			mutex_enter(&conn->c_lock);
3158 			if (conn->c_state == C_DISCONN_PEND) {
3159 				mutex_exit(&conn->c_lock);
3160 				break;
3161 			}
3162 			conn->c_state = C_ERROR_CONN;
3163 
3164 			/*
3165 			 * Free the conn if c_ref goes down to 0
3166 			 */
3167 			if (conn->c_ref == 0) {
3168 				/*
3169 				 * Remove from list and free conn
3170 				 */
3171 				conn->c_state = C_DISCONN_PEND;
3172 				mutex_exit(&conn->c_lock);
3173 				(void) rib_disconnect_channel(conn,
3174 				    &hca->srv_conn_list);
3175 			} else {
3176 				/*
3177 				 * conn will be freed when c_ref goes to 0.
3178 				 * Indicate to cleaning thread not to close
3179 				 * the connection, but just free the channel.
3180 				 */
3181 				conn->c_flags |= C_CLOSE_NOTNEEDED;
3182 				mutex_exit(&conn->c_lock);
3183 			}
3184 			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3185 			break;
3186 		}
3187 		break;
3188 	}
3189 	case IBT_CM_EVENT_CONN_EST:
3190 		/*
3191 		 * RTU received, hence connection established.
3192 		 */
3193 		if (rib_debug > 1)
3194 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3195 			    "(CONN_EST) channel established");
3196 		break;
3197 
3198 	default:
3199 		if (rib_debug > 2) {
3200 			/* Let CM handle the following events. */
3201 			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3202 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3203 				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3204 			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3205 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3206 				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3207 			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3208 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3209 				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3210 			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3211 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3212 				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3213 			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3214 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3215 				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
3216 			}
3217 		}
3218 		return (IBT_CM_DEFAULT);
3219 	}
3220 
3221 	/* accept all other CM messages (i.e. let the CM handle them) */
3222 	return (IBT_CM_ACCEPT);
3223 }
3224 
/*
 * Register the RDMA service (currently only NFS) with IBTF/CM and bind
 * it on every active port of the given HCA.  The CM service id is
 * derived from protocol_num and dst_port via ibt_get_ip_sid().
 * Returns RDMA_SUCCESS when the service could be bound on at least one
 * port, RDMA_FAILED otherwise.
 */
static rdma_stat
rib_register_service(rib_hca_t *hca, int service_type,
	uint8_t protocol_num, in_port_t dst_port)
{
	ibt_srv_desc_t		sdesc;
	ibt_hca_portinfo_t	*port_infop;
	ib_svc_id_t		srv_id;
	ibt_srv_hdl_t		srv_hdl;
	uint_t			port_size;
	uint_t			pki, i, num_ports, nbinds;
	ibt_status_t		ibt_status;
	rib_service_t		*service;
	ib_pkey_t		pkey;

	/*
	 * Query all ports for the given HCA.  Bail out if the HCA is
	 * being detached.
	 */
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
		    &num_ports, &port_size);
		rw_exit(&hca->state_lock);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}
	if (ibt_status != IBT_SUCCESS) {
		return (RDMA_FAILED);
	}

	DTRACE_PROBE1(rpcib__i__regservice_numports,
	    int, num_ports);

	/* Observability only: probe each port's link state. */
	for (i = 0; i < num_ports; i++) {
		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
			    int, i+1);
		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
			DTRACE_PROBE1(rpcib__i__regservice__portactive,
			    int, i+1);
		}
	}

	/*
	 * Get all the IP addresses on this system to register the
	 * given "service type" on all DNS recognized IP addrs.
	 * Each service type such as NFS will have all the systems
	 * IP addresses as its different names. For now the only
	 * type of service we support in RPCIB is NFS.
	 */
	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
	/*
	 * Start registering and binding service to active
	 * on active ports on this HCA.
	 */
	nbinds = 0;
	/* Look for an existing registration of this service type. */
	for (service = rib_stat->service_list;
	    service && (service->srv_type != service_type);
	    service = service->next)
		;

	if (service == NULL) {
		/*
		 * We use IP addresses as the service names for
		 * service registration.  Register each of them
		 * with CM to obtain a svc_id and svc_hdl.  We do not
		 * register the service with machine's loopback address.
		 */
		(void) bzero(&srv_id, sizeof (ib_svc_id_t));
		(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
		(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
		sdesc.sd_handler = rib_srv_cm_handler;
		sdesc.sd_flags = 0;
		ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
		    &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
		    1, &srv_hdl, &srv_id);
		if ((ibt_status != IBT_SUCCESS) &&
		    (ibt_status != IBT_CM_SERVICE_EXISTS)) {
			rw_exit(&rib_stat->service_list_lock);
			DTRACE_PROBE1(rpcib__i__regservice__ibtres,
			    int, ibt_status);
			ibt_free_portinfo(port_infop, port_size);
			return (RDMA_FAILED);
		}

		/*
		 * Allocate and prepare a service entry
		 */
		service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);

		service->srv_type = service_type;
		service->srv_hdl = srv_hdl;
		service->srv_id = srv_id;

		service->next = rib_stat->service_list;
		rib_stat->service_list = service;
		DTRACE_PROBE1(rpcib__i__regservice__new__service,
		    int, service->srv_type);
	} else {
		srv_hdl = service->srv_hdl;
		srv_id = service->srv_id;
		DTRACE_PROBE1(rpcib__i__regservice__existing__service,
		    int, service->srv_type);
	}

	/* Bind the service on each active port that isn't bound yet. */
	for (i = 0; i < num_ports; i++) {
		ibt_sbind_hdl_t		sbp;
		rib_hca_service_t	*hca_srv;
		ib_gid_t		gid;

		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
			continue;

		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
			pkey = port_infop[i].p_pkey_tbl[pki];

			rw_enter(&hca->bound_services_lock, RW_READER);
			gid = port_infop[i].p_sgid_tbl[0];
			for (hca_srv = hca->bound_services; hca_srv;
			    hca_srv = hca_srv->next) {
				if ((hca_srv->srv_id == service->srv_id) &&
				    (hca_srv->gid.gid_prefix ==
				    gid.gid_prefix) &&
				    (hca_srv->gid.gid_guid == gid.gid_guid))
					break;
			}
			rw_exit(&hca->bound_services_lock);
			if (hca_srv != NULL) {
				/*
				 * port is already bound to the service
				 */
				DTRACE_PROBE1(
				    rpcib__i__regservice__already__bound,
				    int, i+1);
				nbinds++;
				continue;
			}

			/* Only bind full-membership, valid pkeys. */
			if ((pkey & IBSRM_HB) &&
			    (pkey != IB_PKEY_INVALID_FULL)) {

				sbp = NULL;
				ibt_status = ibt_bind_service(srv_hdl,
				    gid, NULL, hca, &sbp);

				if (ibt_status == IBT_SUCCESS) {
					hca_srv = kmem_zalloc(
					    sizeof (rib_hca_service_t),
					    KM_SLEEP);
					hca_srv->srv_id = srv_id;
					hca_srv->gid = gid;
					hca_srv->sbind_hdl = sbp;

					rw_enter(&hca->bound_services_lock,
					    RW_WRITER);
					hca_srv->next = hca->bound_services;
					hca->bound_services = hca_srv;
					rw_exit(&hca->bound_services_lock);
					nbinds++;
				}

				DTRACE_PROBE1(rpcib__i__regservice__bindres,
				    int, ibt_status);
			}
		}
	}
	rw_exit(&rib_stat->service_list_lock);

	ibt_free_portinfo(port_infop, port_size);

	if (nbinds == 0) {
		return (RDMA_FAILED);
	} else {
		/*
		 * Put this plugin into accept state, since at least
		 * one registration was successful.
		 */
		mutex_enter(&plugin_state_lock);
		plugin_state = ACCEPT;
		mutex_exit(&plugin_state_lock);
		return (RDMA_SUCCESS);
	}
}
3408 
/*
 * Start listening for RDMA connections on all attached HCAs by
 * registering the NFS service on each of them.  If rd is NULL the call
 * comes from HCA attach handling and reuses the already-recorded
 * rib_stat->q; otherwise rd->q becomes the listen queue.  When rd is
 * non-NULL, rd->active and rd->err_code report whether at least one
 * HCA is listening on return.
 */
void
rib_listen(struct rdma_svc_data *rd)
{
	rdma_stat status;
	int n_listening = 0;
	rib_hca_t *hca;

	mutex_enter(&rib_stat->listen_lock);
	/*
	 * if rd parameter is NULL then it means that rib_stat->q is
	 * already initialized by a call from RDMA and we just want to
	 * add a newly attached HCA to the same listening state as other
	 * HCAs.
	 */
	if (rd == NULL) {
		if (rib_stat->q == NULL) {
			mutex_exit(&rib_stat->listen_lock);
			return;
		}
	} else {
		rib_stat->q = &rd->q;
	}
	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		/*
		 * First check if a hca is still attached
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state != HCA_INITED) {
			rw_exit(&hca->state_lock);
			continue;
		}
		rw_exit(&hca->state_lock);

		/*
		 * Right now the only service type is NFS. Hence
		 * force feed this value. Ideally to communicate
		 * the service type it should be passed down in
		 * rdma_svc_data.
		 */
		status = rib_register_service(hca, NFS,
		    IPPROTO_TCP, nfs_rdma_port);
		if (status == RDMA_SUCCESS)
			n_listening++;
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/*
	 * Service active on an HCA, check rd->err_code for more
	 * explainable errors.
	 */
	if (rd) {
		if (n_listening > 0) {
			rd->active = 1;
			rd->err_code = RDMA_SUCCESS;
		} else {
			rd->active = 0;
			rd->err_code = RDMA_FAILED;
		}
	}
	mutex_exit(&rib_stat->listen_lock);
}
3471 
3472 /* XXXX */
3473 /* ARGSUSED */
/*
 * Stop listening for RDMA connections: move the plugin out of accept
 * state, close all server-side channels, and unbind/deregister the
 * services on every attached HCA.  Finally clears rib_stat->q so a
 * later port-up event cannot resurrect a stale listen queue.
 */
static void
rib_listen_stop(struct rdma_svc_data *svcdata)
{
	rib_hca_t		*hca;

	mutex_enter(&rib_stat->listen_lock);
	/*
	 * KRPC called the RDMATF to stop the listeners, this means
	 * stop sending incoming or received requests to KRPC master
	 * transport handle for RDMA-IB. This also means that the
	 * master transport handle, responsible for us, is going away.
	 */
	mutex_enter(&plugin_state_lock);
	plugin_state = NO_ACCEPT;
	if (svcdata != NULL)
		svcdata->active = 0;
	mutex_exit(&plugin_state_lock);

	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		/*
		 * First check if a hca is still attached
		 */
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state == HCA_DETACHED) {
			rw_exit(&hca->state_lock);
			continue;
		}
		/* state_lock is held (reader) across channel teardown */
		rib_close_channels(&hca->srv_conn_list);
		rib_stop_services(hca);
		rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/*
	 * Avoid rib_listen() using the stale q field.
	 * This could happen if a port goes up after all services
	 * are already unregistered.
	 */
	rib_stat->q = NULL;
	mutex_exit(&rib_stat->listen_lock);
}
3516 
3517 /*
3518  * Traverse the HCA's service list to unbind and deregister services.
3519  * For each bound service of HCA to be removed, first find the corresponding
3520  * service handle (srv_hdl) and then unbind the service by calling
3521  * ibt_unbind_service().
3522  */
static void
rib_stop_services(rib_hca_t *hca)
{
	rib_hca_service_t *srv_list, *to_remove;

	/*
	 * unbind and deregister the services for this service type.
	 * Right now there is only one service type. In future it will
	 * be passed down to this function.
	 */
	/* Detach the whole bound-services list atomically under the lock. */
	rw_enter(&hca->bound_services_lock, RW_READER);
	srv_list = hca->bound_services;
	hca->bound_services = NULL;
	rw_exit(&hca->bound_services_lock);

	/* Walk the snapshot, unbinding and freeing each entry. */
	while (srv_list != NULL) {
		rib_service_t *sc;

		to_remove = srv_list;
		srv_list = to_remove->next;
		/* Find the CM service handle matching this binding's id. */
		rw_enter(&rib_stat->service_list_lock, RW_READER);
		for (sc = rib_stat->service_list;
		    sc && (sc->srv_id != to_remove->srv_id);
		    sc = sc->next)
			;
		/*
		 * if sc is NULL then the service doesn't exist anymore,
		 * probably just removed completely through rib_stat.
		 */
		if (sc != NULL)
			(void) ibt_unbind_service(sc->srv_hdl,
			    to_remove->sbind_hdl);
		rw_exit(&rib_stat->service_list_lock);
		kmem_free(to_remove, sizeof (rib_hca_service_t));
	}
}
3559 
3560 static struct svc_recv *
3561 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3562 {
3563 	struct svc_recv	*recvp;
3564 
3565 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3566 	recvp->vaddr = sgl->ds_va;
3567 	recvp->qp = qp;
3568 	recvp->bytes_xfer = 0;
3569 	return (recvp);
3570 }
3571 
/*
 * Free a svc_recv structure allocated by rib_init_svc_recv().
 * Always returns 0.
 */
static int
rib_free_svc_recv(struct svc_recv *recvp)
{
	kmem_free(recvp, sizeof (*recvp));

	return (0);
}
3579 
3580 static struct reply *
3581 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3582 {
3583 	struct reply	*rep;
3584 
3585 
3586 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3587 	if (rep == NULL) {
3588 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3589 		return (NULL);
3590 	}
3591 	rep->xid = msgid;
3592 	rep->vaddr_cq = NULL;
3593 	rep->bytes_xfer = 0;
3594 	rep->status = (uint_t)REPLY_WAIT;
3595 	rep->prev = NULL;
3596 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3597 
3598 	mutex_enter(&qp->replylist_lock);
3599 	if (qp->replylist) {
3600 		rep->next = qp->replylist;
3601 		qp->replylist->prev = rep;
3602 	}
3603 	qp->rep_list_size++;
3604 
3605 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3606 	    int, qp->rep_list_size);
3607 
3608 	qp->replylist = rep;
3609 	mutex_exit(&qp->replylist_lock);
3610 
3611 	return (rep);
3612 }
3613 
3614 static rdma_stat
3615 rib_rem_replylist(rib_qp_t *qp)
3616 {
3617 	struct reply	*r, *n;
3618 
3619 	mutex_enter(&qp->replylist_lock);
3620 	for (r = qp->replylist; r != NULL; r = n) {
3621 		n = r->next;
3622 		(void) rib_remreply(qp, r);
3623 	}
3624 	mutex_exit(&qp->replylist_lock);
3625 
3626 	return (RDMA_SUCCESS);
3627 }
3628 
3629 static int
3630 rib_remreply(rib_qp_t *qp, struct reply *rep)
3631 {
3632 
3633 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3634 	if (rep->prev) {
3635 		rep->prev->next = rep->next;
3636 	}
3637 	if (rep->next) {
3638 		rep->next->prev = rep->prev;
3639 	}
3640 	if (qp->replylist == rep)
3641 		qp->replylist = rep->next;
3642 
3643 	cv_destroy(&rep->wait_cv);
3644 	qp->rep_list_size--;
3645 
3646 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3647 	    int, qp->rep_list_size);
3648 
3649 	kmem_free(rep, sizeof (*rep));
3650 
3651 	return (0);
3652 }
3653 
/*
 * Register the buffer [buf, buf+buflen) in address space adsp with the
 * HCA of this connection.  On success, fills buf_handle with the mr
 * handle and the local/remote keys; on failure the handle fields are
 * cleared.  Returns the status from rib_reg_mem().
 */
rdma_stat
rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;
	rib_hca_t	*hca = (ctoqp(conn))->hca;

	/*
	 * Note: ALL buffer pools use the same memory type RDMARW.
	 */
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
	if (status == RDMA_SUCCESS) {
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
	} else {
		/* NOTE(review): mrc_linfo holds a uintptr_t; NULL works
		 * here but 0 would be the cleaner sentinel — confirm. */
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}
3678 
/*
 * Register the memory region [buf, buf+size) in address space adsp
 * with local write, remote read/write and window-bind access enabled
 * (plus any caller-supplied flags in spec).  Fails if the HCA is being
 * detached.  On success the mr handle and descriptor are returned
 * through mr_hdlp and mr_descp.
 */
static rdma_stat
rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
	ibt_mr_flags_t spec,
	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
{
	ibt_mr_attr_t	mem_attr;
	ibt_status_t	ibt_status;
	mem_attr.mr_vaddr = (uintptr_t)buf;
	mem_attr.mr_len = (ib_msglen_t)size;
	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
	    IBT_MR_ENABLE_WINDOW_BIND | spec;

	/* Only register while the HCA is still attached. */
	rw_enter(&hca->state_lock, RW_READER);
	if (hca->state != HCA_DETACHED) {
		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
		    &mem_attr, mr_hdlp, mr_descp);
		rw_exit(&hca->state_lock);
	} else {
		rw_exit(&hca->state_lock);
		return (RDMA_FAILED);
	}

	if (ibt_status != IBT_SUCCESS) {
		return (RDMA_FAILED);
	}
	return (RDMA_SUCCESS);
}
3708 
/*
 * Register memory for non-coherent (explicitly synced) use.  If a
 * long-request-cache entry (lrc) is supplied and already registered,
 * return its cached keys without re-registering.  Otherwise register
 * the buffer (for an lrc entry, always the whole cached buffer) and,
 * on success, cache the handle in the lrc entry for reuse.
 * sync_handle receives the mr handle for later rib_syncmem() calls.
 */
rdma_stat
rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
{
	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
	rib_lrc_entry_t *l;
	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
	rdma_stat	status;
	rib_hca_t	*hca = (ctoqp(conn))->hca;

	/*
	 * Non-coherent memory registration.
	 */
	l = (rib_lrc_entry_t *)lrc;
	if (l) {
		if (l->registered) {
			/* Fast path: reuse the cached registration. */
			buf_handle->mrc_linfo =
			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
			buf_handle->mrc_lmr =
			    (uint32_t)l->lrc_mhandle.mrc_lmr;
			buf_handle->mrc_rmr =
			    (uint32_t)l->lrc_mhandle.mrc_rmr;
			*sync_handle = (RIB_SYNCMEM_HANDLE)
			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
			return (RDMA_SUCCESS);
		} else {
			/* Always register the whole buffer */
			buf = (caddr_t)l->lrc_buf;
			buflen = l->lrc_len;
		}
	}
	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);

	if (status == RDMA_SUCCESS) {
		if (l) {
			/* Cache the registration for future calls. */
			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
			l->registered		 = TRUE;
		}
		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
	} else {
		buf_handle->mrc_linfo = NULL;
		buf_handle->mrc_lmr = 0;
		buf_handle->mrc_rmr = 0;
	}
	return (status);
}
3760 
3761 /* ARGSUSED */
/*
 * Deregister a memory region previously registered via
 * rib_registermem()/rib_reg_mem().  buf is unused.  Always returns
 * RDMA_SUCCESS; the ibt_deregister_mr() status is intentionally
 * ignored.
 */
rdma_stat
rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
{
	rib_hca_t *hca = (ctoqp(conn))->hca;
	/*
	 * Allow memory deregistration even if HCA is
	 * getting detached. Need all outstanding
	 * memory registrations to be deregistered
	 * before HCA_DETACH_EVENT can be accepted.
	 */
	(void) ibt_deregister_mr(hca->hca_hdl,
	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
	return (RDMA_SUCCESS);
}
3776 
3777 /* ARGSUSED */
3778 rdma_stat
3779 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3780 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3781 {
3782 	rib_lrc_entry_t *l;
3783 	l = (rib_lrc_entry_t *)lrc;
3784 	if (l)
3785 		if (l->registered)
3786 			return (RDMA_SUCCESS);
3787 
3788 	(void) rib_deregistermem(conn, buf, buf_handle);
3789 
3790 	return (RDMA_SUCCESS);
3791 }
3792 
3793 /* ARGSUSED */
3794 rdma_stat
3795 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3796 		int len, int cpu)
3797 {
3798 	ibt_status_t	status;
3799 	rib_hca_t *hca = (ctoqp(conn))->hca;
3800 	ibt_mr_sync_t	mr_segment;
3801 
3802 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3803 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3804 	mr_segment.ms_len = (ib_memlen_t)len;
3805 	if (cpu) {
3806 		/* make incoming data visible to memory */
3807 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3808 	} else {
3809 		/* make memory changes visible to IO */
3810 		mr_segment.ms_flags = IBT_SYNC_READ;
3811 	}
3812 	rw_enter(&hca->state_lock, RW_READER);
3813 	if (hca->state != HCA_DETACHED) {
3814 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3815 		rw_exit(&hca->state_lock);
3816 	} else {
3817 		rw_exit(&hca->state_lock);
3818 		return (RDMA_FAILED);
3819 	}
3820 
3821 	if (status == IBT_SUCCESS)
3822 		return (RDMA_SUCCESS);
3823 	else {
3824 		return (RDMA_FAILED);
3825 	}
3826 }
3827 
3828 /*
3829  * XXXX	????
3830  */
3831 static rdma_stat
3832 rib_getinfo(rdma_info_t *info)
3833 {
3834 	/*
3835 	 * XXXX	Hack!
3836 	 */
3837 	info->addrlen = 16;
3838 	info->mts = 1000000;
3839 	info->mtu = 1000000;
3840 
3841 	return (RDMA_SUCCESS);
3842 }
3843 
/*
 * Create and register a pool of num fixed-size buffers of the given
 * type on this HCA (SEND_BUFFER: RPC_MSG_SZ bytes each, RECV_BUFFER:
 * RPC_BUF_SIZE bytes each).  Each buffer is individually registered
 * with the HCA.  On any failure all registrations and allocations made
 * so far are unwound and NULL is returned.
 */
rib_bufpool_t *
rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
{
	rib_bufpool_t	*rbp = NULL;
	bufpool_t	*bp = NULL;
	caddr_t		buf;
	ibt_mr_attr_t	mem_attr;
	ibt_status_t	ibt_status;
	int		i, j;

	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);

	/* bufpool_t carries a trailing array of num buffer pointers. */
	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
	    num * sizeof (void *), KM_SLEEP);

	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
	bp->numelems = num;


	/* Per-type buffer size and registration flags. */
	switch (ptype) {
	case SEND_BUFFER:
		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
		bp->rsize = RPC_MSG_SZ;
		break;
	case RECV_BUFFER:
		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
		bp->rsize = RPC_BUF_SIZE;
		break;
	default:
		goto fail;
	}

	/*
	 * Register the pool.
	 */
	bp->bufsize = num * bp->rsize;
	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
	    sizeof (ibt_mr_desc_t), KM_SLEEP);
	rw_enter(&hca->state_lock, RW_READER);

	if (hca->state == HCA_DETACHED) {
		rw_exit(&hca->state_lock);
		goto fail;
	}

	/* Register each rsize-byte slice of the pool individually. */
	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
		mem_attr.mr_vaddr = (uintptr_t)buf;
		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
		mem_attr.mr_as = NULL;
		ibt_status = ibt_register_mr(hca->hca_hdl,
		    hca->pd_hdl, &mem_attr,
		    &rbp->mr_hdl[i],
		    &rbp->mr_desc[i]);
		if (ibt_status != IBT_SUCCESS) {
			/* Unwind the registrations made so far. */
			for (j = 0; j < i; j++) {
				(void) ibt_deregister_mr(hca->hca_hdl,
				    rbp->mr_hdl[j]);
			}
			rw_exit(&hca->state_lock);
			goto fail;
		}
	}
	rw_exit(&hca->state_lock);
	buf = (caddr_t)bp->buf;
	for (i = 0; i < num; i++, buf += bp->rsize) {
		bp->buflist[i] = (void *)buf;
	}
	bp->buffree = num - 1;	/* no. of free buffers */
	rbp->bpool = bp;

	return (rbp);
fail:
	if (bp) {
		if (bp->buf)
			kmem_free(bp->buf, bp->bufsize);
		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
	}
	if (rbp) {
		if (rbp->mr_hdl)
			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
		if (rbp->mr_desc)
			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
		kmem_free(rbp, sizeof (rib_bufpool_t));
	}
	return (NULL);
}
3934 
3935 static void
3936 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3937 {
3938 	int i;
3939 	rib_bufpool_t *rbp = NULL;
3940 	bufpool_t *bp;
3941 
3942 	/*
3943 	 * Obtain pool address based on type of pool
3944 	 */
3945 	switch (ptype) {
3946 		case SEND_BUFFER:
3947 			rbp = hca->send_pool;
3948 			break;
3949 		case RECV_BUFFER:
3950 			rbp = hca->recv_pool;
3951 			break;
3952 		default:
3953 			return;
3954 	}
3955 	if (rbp == NULL)
3956 		return;
3957 
3958 	bp = rbp->bpool;
3959 
3960 	/*
3961 	 * Deregister the pool memory and free it.
3962 	 */
3963 	for (i = 0; i < bp->numelems; i++) {
3964 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3965 	}
3966 }
3967 
3968 static void
3969 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3970 {
3971 
3972 	rib_bufpool_t *rbp = NULL;
3973 	bufpool_t *bp;
3974 
3975 	/*
3976 	 * Obtain pool address based on type of pool
3977 	 */
3978 	switch (ptype) {
3979 		case SEND_BUFFER:
3980 			rbp = hca->send_pool;
3981 			break;
3982 		case RECV_BUFFER:
3983 			rbp = hca->recv_pool;
3984 			break;
3985 		default:
3986 			return;
3987 	}
3988 	if (rbp == NULL)
3989 		return;
3990 
3991 	bp = rbp->bpool;
3992 
3993 	/*
3994 	 * Free the pool memory.
3995 	 */
3996 	if (rbp->mr_hdl)
3997 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3998 
3999 	if (rbp->mr_desc)
4000 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
4001 	if (bp->buf)
4002 		kmem_free(bp->buf, bp->bufsize);
4003 	mutex_destroy(&bp->buflock);
4004 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
4005 	kmem_free(rbp, sizeof (rib_bufpool_t));
4006 }
4007 
/*
 * Tear down a buffer pool of the given type: deregister all of its
 * memory regions with the HCA, then free the pool memory.
 */
void
rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
{
	/*
	 * Deregister the pool memory and free it.
	 */
	rib_rbufpool_deregister(hca, ptype);
	rib_rbufpool_free(hca, ptype);
}
4017 
4018 /*
4019  * Fetch a buffer from the pool of type specified in rdbuf->type.
4020  */
4021 static rdma_stat
4022 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4023 {
4024 	rib_lrc_entry_t *rlep;
4025 
4026 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
4027 		rlep = rib_get_cache_buf(conn, rdbuf->len);
4028 		rdbuf->rb_private =  (caddr_t)rlep;
4029 		rdbuf->addr = rlep->lrc_buf;
4030 		rdbuf->handle = rlep->lrc_mhandle;
4031 		return (RDMA_SUCCESS);
4032 	}
4033 
4034 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4035 	if (rdbuf->addr) {
4036 		switch (rdbuf->type) {
4037 		case SEND_BUFFER:
4038 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
4039 			break;
4040 		case RECV_BUFFER:
4041 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
4042 			break;
4043 		default:
4044 			rdbuf->len = 0;
4045 		}
4046 		return (RDMA_SUCCESS);
4047 	} else
4048 		return (RDMA_FAILED);
4049 }
4050 
4051 /*
4052  * Fetch a buffer of specified type.
4053  * Note that rdbuf->handle is mw's rkey.
4054  */
/*
 * Fetch a pre-registered buffer of the requested type from the HCA's
 * buffer pool and fill rdbuf->addr/len/handle from the matching
 * registration descriptor.  Returns the buffer address, or NULL when
 * the pool is exhausted, the type is unknown, or no registration
 * matches the buffer address.
 */
static void *
rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
{
	rib_qp_t	*qp = ctoqp(conn);
	rib_hca_t	*hca = qp->hca;
	rdma_btype	ptype = rdbuf->type;
	void		*buf;
	rib_bufpool_t	*rbp = NULL;
	bufpool_t	*bp;
	int		i;

	/*
	 * Obtain pool address based on type of pool
	 */
	switch (ptype) {
	case SEND_BUFFER:
		rbp = hca->send_pool;
		break;
	case RECV_BUFFER:
		rbp = hca->recv_pool;
		break;
	default:
		return (NULL);
	}
	if (rbp == NULL)
		return (NULL);

	bp = rbp->bpool;

	mutex_enter(&bp->buflock);
	/* buffree is the index of the last free slot; < 0 means empty. */
	if (bp->buffree < 0) {
		mutex_exit(&bp->buflock);
		return (NULL);
	}

	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
	buf = bp->buflist[bp->buffree];
	rdbuf->addr = buf;
	rdbuf->len = bp->rsize;
	/* Find the registration whose vaddr matches this buffer. */
	for (i = bp->numelems - 1; i >= 0; i--) {
		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
			rdbuf->handle.mrc_rmr =
			    (uint32_t)rbp->mr_desc[i].md_rkey;
			rdbuf->handle.mrc_linfo =
			    (uintptr_t)rbp->mr_hdl[i];
			rdbuf->handle.mrc_lmr =
			    (uint32_t)rbp->mr_desc[i].md_lkey;
			/* Only consume the slot once a match is found. */
			bp->buffree--;

			mutex_exit(&bp->buflock);

			return (buf);
		}
	}

	mutex_exit(&bp->buflock);

	return (NULL);
}
4114 
4115 static void
4116 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4117 {
4118 
4119 	if (rdbuf->type == RDMA_LONG_BUFFER) {
4120 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4121 		rdbuf->rb_private = NULL;
4122 		return;
4123 	}
4124 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4125 }
4126 
4127 static void
4128 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4129 {
4130 	rib_qp_t *qp = ctoqp(conn);
4131 	rib_hca_t *hca = qp->hca;
4132 	rib_bufpool_t *rbp = NULL;
4133 	bufpool_t *bp;
4134 
4135 	/*
4136 	 * Obtain pool address based on type of pool
4137 	 */
4138 	switch (ptype) {
4139 	case SEND_BUFFER:
4140 		rbp = hca->send_pool;
4141 		break;
4142 	case RECV_BUFFER:
4143 		rbp = hca->recv_pool;
4144 		break;
4145 	default:
4146 		return;
4147 	}
4148 	if (rbp == NULL)
4149 		return;
4150 
4151 	bp = rbp->bpool;
4152 
4153 	mutex_enter(&bp->buflock);
4154 	if (++bp->buffree >= bp->numelems) {
4155 		/*
4156 		 * Should never happen
4157 		 */
4158 		bp->buffree--;
4159 	} else {
4160 		bp->buflist[bp->buffree] = buf;
4161 	}
4162 	mutex_exit(&bp->buflock);
4163 }
4164 
4165 static rdma_stat
4166 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4167 {
4168 	rw_enter(&connlist->conn_lock, RW_WRITER);
4169 	if (connlist->conn_hd) {
4170 		cn->c_next = connlist->conn_hd;
4171 		connlist->conn_hd->c_prev = cn;
4172 	}
4173 	connlist->conn_hd = cn;
4174 	rw_exit(&connlist->conn_lock);
4175 
4176 	return (RDMA_SUCCESS);
4177 }
4178 
4179 static rdma_stat
4180 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4181 {
4182 	rw_enter(&connlist->conn_lock, RW_WRITER);
4183 	if (cn->c_prev) {
4184 		cn->c_prev->c_next = cn->c_next;
4185 	}
4186 	if (cn->c_next) {
4187 		cn->c_next->c_prev = cn->c_prev;
4188 	}
4189 	if (connlist->conn_hd == cn)
4190 		connlist->conn_hd = cn->c_next;
4191 	rw_exit(&connlist->conn_lock);
4192 
4193 	return (RDMA_SUCCESS);
4194 }
4195 
4196 /* ARGSUSED */
4197 static rdma_stat
4198 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4199     int addr_type, void *handle, CONN **conn)
4200 {
4201 	rdma_stat status;
4202 	rpcib_ping_t rpt;
4203 
4204 	status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4205 	return (status);
4206 }
4207 
4208 /*
4209  * rib_find_hca_connection
4210  *
4211  * if there is an existing connection to the specified address then
4212  * it will be returned in conn, otherwise conn will be set to NULL.
4213  * Also cleans up any connection that is in error state.
4214  */
static int
rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
    struct netbuf *d_svcaddr, CONN **conn)
{
	CONN *cn;
	clock_t cv_stat, timout;

	*conn = NULL;
again:
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	cn = hca->cl_conn_list.conn_hd;
	while (cn != NULL) {
		/*
		 * First, clear up any connection in the ERROR state
		 */
		mutex_enter(&cn->c_lock);
		if (cn->c_state == C_ERROR_CONN) {
			if (cn->c_ref == 0) {
				/*
				 * Remove connection from list and destroy it.
				 */
				cn->c_state = C_DISCONN_PEND;
				mutex_exit(&cn->c_lock);
				rw_exit(&hca->cl_conn_list.conn_lock);
				rib_conn_close((void *)cn);
				/* List may have changed; rescan from head. */
				goto again;
			}
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}
		if (cn->c_state == C_DISCONN_PEND) {
			mutex_exit(&cn->c_lock);
			cn = cn->c_next;
			continue;
		}

		/*
		 * source address is only checked for if there is one,
		 * this is the case for retries.
		 */
		if ((cn->c_raddr.len == d_svcaddr->len) &&
		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
		    d_svcaddr->len) == 0) &&
		    ((s_svcaddr->len == 0) ||
		    ((cn->c_laddr.len == s_svcaddr->len) &&
		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
		    s_svcaddr->len) == 0)))) {
			/*
			 * Our connection. Give up conn list lock
			 * as we are done traversing the list.
			 */
			rw_exit(&hca->cl_conn_list.conn_lock);
			if (cn->c_state == C_CONNECTED) {
				cn->c_ref++;	/* sharing a conn */
				mutex_exit(&cn->c_lock);
				*conn = cn;
				return (RDMA_SUCCESS);
			}
			if (cn->c_state == C_CONN_PEND) {
				/*
				 * Hold a reference to this conn before
				 * we give up the lock.
				 */
				cn->c_ref++;
				timout =  ddi_get_lbolt() +
				    drv_usectohz(CONN_WAIT_TIME * 1000000);
				/*
				 * Wait (interruptibly, with timeout) for
				 * the pending connect to resolve.
				 */
				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
				    &cn->c_lock, timout)) > 0 &&
				    cn->c_state == C_CONN_PEND)
					;
				/*
				 * NOTE(review): rib_conn_release_locked()
				 * appears to drop c_ref and c_lock on all
				 * paths below — confirm against its
				 * definition.
				 */
				if (cv_stat == 0) {
					/* interrupted by a signal */
					(void) rib_conn_release_locked(cn);
					return (RDMA_INTR);
				}
				if (cv_stat < 0) {
					/* timed out */
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
				if (cn->c_state == C_CONNECTED) {
					*conn = cn;
					mutex_exit(&cn->c_lock);
					return (RDMA_SUCCESS);
				} else {
					(void) rib_conn_release_locked(cn);
					return (RDMA_TIMEDOUT);
				}
			}
		}
		mutex_exit(&cn->c_lock);
		cn = cn->c_next;
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	*conn = NULL;
	return (RDMA_FAILED);
}
4311 
4312 /*
4313  * Connection management.
4314  * IBTF does not support recycling of channels. So connections are only
4315  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
4316  * C_DISCONN_PEND state. No C_IDLE state.
4317  * C_CONN_PEND state: Connection establishment in progress to the server.
4318  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4319  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4320  * only in this state.
4321  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4322  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4323  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4324  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4325  * c_ref drops to 0 (this indicates that RPC has no more references to this
4326  * connection), the connection should be destroyed. A connection transitions
4327  * into this state when it is being destroyed.
4328  */
/*
 * Find an existing connection to the server at d_svcaddr or, failing
 * that, establish a new one.  On RDMA_SUCCESS, *conn holds a reference
 * for the caller.  s_svcaddr (source address) is only matched when
 * non-empty (retry case); addr_type selects IPv4/IPv6 for the ping.
 */
/* ARGSUSED */
static rdma_stat
rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
    int addr_type, rpcib_ping_t *rpt, CONN **conn)
{
	CONN *cn;
	int status;
	rib_hca_t *hca;
	rib_qp_t *qp;
	int s_addr_len;
	char *s_addr_buf;

	/*
	 * First scan every attached HCA for an existing (or pending)
	 * connection to this server.
	 */
	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
		rw_enter(&hca->state_lock, RW_READER);
		if (hca->state != HCA_DETACHED) {
			status = rib_find_hca_connection(hca, s_svcaddr,
			    d_svcaddr, conn);
			rw_exit(&hca->state_lock);
			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
				rw_exit(&rib_stat->hcas_list_lock);
				return (status);
			}
		} else
			rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/*
	 * No existing connection found, establish a new connection.
	 */
	bzero(rpt, sizeof (rpcib_ping_t));

	/* Resolve a path (and source IP) to the server. */
	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
	if (status != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	hca = rpt->hca;

	if (rpt->srcip.family == AF_INET) {
		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
	} else if (rpt->srcip.family == AF_INET6) {
		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
	} else {
		return (RDMA_FAILED);
	}

	/*
	 * Channel to server doesn't exist yet, create one.
	 */
	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
		return (RDMA_FAILED);
	}
	cn = qptoc(qp);
	cn->c_state = C_CONN_PEND;
	cn->c_ref = 1;

	/* Record the local (source) address on the conn. */
	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;

	/*
	 * Set the netid and an all-ones address mask matching the
	 * address family.
	 */
	if (rpt->srcip.family == AF_INET) {
		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
		(void) strcpy(cn->c_netid, RIBNETID_TCP);

		cn->c_addrmask.len = cn->c_addrmask.maxlen =
		    sizeof (struct sockaddr_in);
		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);

		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_addr.s_addr =
		    (uint32_t)~0;
		((struct sockaddr_in *)cn->c_addrmask.buf)->sin_family =
		    (ushort_t)~0;

	} else {
		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
		(void) strcpy(cn->c_netid, RIBNETID_TCP6);

		cn->c_addrmask.len = cn->c_addrmask.maxlen =
		    sizeof (struct sockaddr_in6);
		cn->c_addrmask.buf = kmem_zalloc(cn->c_addrmask.len, KM_SLEEP);

		(void) memset(
		    &((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_addr,
		    (uchar_t)~0, sizeof (struct in6_addr));
		((struct sockaddr_in6 *)cn->c_addrmask.buf)->sin6_family =
		    (sa_family_t)~0;
	}

	/*
	 * Add to conn list.
	 * We had given up the READER lock. In the time since then,
	 * another thread might have created the connection we are
	 * trying here. But for now, that is quite alright - there
	 * might be two connections between a pair of hosts instead
	 * of one. If we really want to close that window,
	 * then need to check the list after acquiring the
	 * WRITER lock.
	 */
	(void) rib_add_connlist(cn, &hca->cl_conn_list);
	status = rib_conn_to_srv(hca, qp, rpt);
	mutex_enter(&cn->c_lock);

	if (cn->c_flags & C_CLOSE_PENDING) {
		/*
		 * This handles a case where the module or
		 * HCA detached in the time a connection is
		 * established. In such a case close the
		 * connection immediately if this is the
		 * only reference.
		 */
		if (cn->c_ref == 1) {
			cn->c_ref--;
			cn->c_state = C_DISCONN_PEND;
			mutex_exit(&cn->c_lock);
			rib_conn_close((void *)cn);
			return (RDMA_FAILED);
		}

		/*
		 * Connection to be closed later when c_ref = 0
		 */
		status = RDMA_FAILED;
	}

	/* Publish the final state and wake any waiters in C_CONN_PEND. */
	if (status == RDMA_SUCCESS) {
		cn->c_state = C_CONNECTED;
		*conn = cn;
	} else {
		cn->c_state = C_ERROR_CONN;
		cn->c_ref--;
	}
	cv_signal(&cn->c_cv);
	mutex_exit(&cn->c_lock);
	return (status);
}
4467 
/*
 * Close and tear down a connection.  The channel close is made
 * idempotent via C_CLOSE_NOTNEEDED; C_CLOSE_PENDING stays set across
 * the blocking ibt_close_rc_channel() so other threads can tell a
 * close is in flight.  Finally the conn is removed from the per-HCA
 * client or server list via rib_disconnect_channel().
 */
static void
rib_conn_close(void *rarg)
{
	CONN *conn = (CONN *)rarg;
	rib_qp_t *qp = ctoqp(conn);

	mutex_enter(&conn->c_lock);
	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {

		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);

		/*
		 * Live connection in CONNECTED state.
		 */
		if (conn->c_state == C_CONNECTED) {
			conn->c_state = C_ERROR_CONN;
		}
		/* Drop c_lock across the blocking channel close. */
		mutex_exit(&conn->c_lock);

		rib_close_a_channel(conn);

		mutex_enter(&conn->c_lock);
		conn->c_flags &= ~C_CLOSE_PENDING;
	}

	mutex_exit(&conn->c_lock);

	/* Unhook the conn from whichever list its qp belongs to. */
	if (qp->mode == RIB_SERVER)
		(void) rib_disconnect_channel(conn,
		    &qp->hca->srv_conn_list);
	else
		(void) rib_disconnect_channel(conn,
		    &qp->hca->cl_conn_list);
}
4502 
/*
 * timeout(9F) callback that reaps an idle client connection.
 * Disarms if the conn has been re-referenced or is already being
 * destroyed; re-arms if there was activity since the timer was set
 * (and the conn is healthy); otherwise dispatches rib_conn_close()
 * to the HCA cleanup taskq, retrying in RDMA_CONN_REAP_RETRY seconds
 * if the dispatch fails.
 */
static void
rib_conn_timeout_call(void *carg)
{
	time_t idle_time;
	CONN *conn = (CONN *)carg;
	rib_hca_t *hca = ctoqp(conn)->hca;
	int error;

	mutex_enter(&conn->c_lock);
	/* Conn back in use, or teardown already under way: disarm. */
	if ((conn->c_ref > 0) ||
	    (conn->c_state == C_DISCONN_PEND)) {
		conn->c_timeout = NULL;
		mutex_exit(&conn->c_lock);
		return;
	}

	idle_time = (gethrestime_sec() - conn->c_last_used);

	if ((idle_time <= rib_conn_timeout) &&
	    (conn->c_state != C_ERROR_CONN)) {
		/*
		 * There was activity after the last timeout.
		 * Extend the conn life. Unless the conn is
		 * already in error state.
		 */
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(rib_conn_timeout - idle_time));
		mutex_exit(&conn->c_lock);
		return;
	}

	/* Idle too long (or errored): hand the close to the taskq. */
	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
	    (void *)conn, DDI_NOSLEEP);

	/*
	 * If taskq dispatch fails above, then reset the timeout
	 * to try again after 10 secs.
	 */

	if (error != DDI_SUCCESS) {
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
		mutex_exit(&conn->c_lock);
		return;
	}

	conn->c_state = C_DISCONN_PEND;
	mutex_exit(&conn->c_lock);
}
4552 
/*
 * Drop one reference on `conn'.  Convenience wrapper that takes
 * c_lock and hands off to rib_conn_release_locked(), which releases
 * the lock before returning.
 */
static rdma_stat
rib_conn_release(CONN *conn)
{
	mutex_enter(&conn->c_lock);
	return (rib_conn_release_locked(conn));
}
4559 
4560 /*
4561  * Expects conn->c_lock to be held on entry.
4562  * c_lock released on return
4563  */
static rdma_stat
rib_conn_release_locked(CONN *conn)
{
	conn->c_ref--;

	/* Remember when the conn was last used, for idle reaping. */
	conn->c_last_used = gethrestime_sec();
	if (conn->c_ref > 0) {
		mutex_exit(&conn->c_lock);
		return (RDMA_SUCCESS);
	}

	/*
	 * If a conn is C_ERROR_CONN, close the channel.
	 */
	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
		conn->c_state = C_DISCONN_PEND;
		mutex_exit(&conn->c_lock);
		rib_conn_close((void *)conn);
		return (RDMA_SUCCESS);
	}

	/*
	 * c_ref == 0, set a timeout for conn release
	 */

	/* Only arm the reap timer if one isn't already outstanding. */
	if (conn->c_timeout == NULL) {
		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
		    SEC_TO_TICK(rib_conn_timeout));
	}

	mutex_exit(&conn->c_lock);
	return (RDMA_SUCCESS);
}
4597 
4598 /*
4599  * Add at front of list
4600  */
4601 static struct rdma_done_list *
4602 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4603 {
4604 	struct rdma_done_list *rd;
4605 
4606 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4607 
4608 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4609 	rd->xid = xid;
4610 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4611 
4612 	rd->prev = NULL;
4613 	rd->next = qp->rdlist;
4614 	if (qp->rdlist != NULL)
4615 		qp->rdlist->prev = rd;
4616 	qp->rdlist = rd;
4617 
4618 	return (rd);
4619 }
4620 
4621 static void
4622 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4623 {
4624 	struct rdma_done_list *r;
4625 
4626 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4627 
4628 	r = rd->next;
4629 	if (r != NULL) {
4630 		r->prev = rd->prev;
4631 	}
4632 
4633 	r = rd->prev;
4634 	if (r != NULL) {
4635 		r->next = rd->next;
4636 	} else {
4637 		qp->rdlist = rd->next;
4638 	}
4639 
4640 	cv_destroy(&rd->rdma_done_cv);
4641 	kmem_free(rd, sizeof (*rd));
4642 }
4643 
4644 static void
4645 rdma_done_rem_list(rib_qp_t *qp)
4646 {
4647 	struct rdma_done_list	*r, *n;
4648 
4649 	mutex_enter(&qp->rdlist_lock);
4650 	for (r = qp->rdlist; r != NULL; r = n) {
4651 		n = r->next;
4652 		rdma_done_rm(qp, r);
4653 	}
4654 	mutex_exit(&qp->rdlist_lock);
4655 }
4656 
4657 static void
4658 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4659 {
4660 	struct rdma_done_list *r = qp->rdlist;
4661 
4662 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4663 
4664 	while (r) {
4665 		if (r->xid == xid) {
4666 			cv_signal(&r->rdma_done_cv);
4667 			return;
4668 		} else {
4669 			r = r->next;
4670 		}
4671 	}
4672 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4673 	    int, xid);
4674 }
4675 
4676 /*
4677  * Expects conn->c_lock to be held by the caller.
4678  */
4679 
4680 static void
4681 rib_close_a_channel(CONN *conn)
4682 {
4683 	rib_qp_t	*qp;
4684 	qp = ctoqp(conn);
4685 
4686 	if (qp->qp_hdl == NULL) {
4687 		/* channel already freed */
4688 		return;
4689 	}
4690 
4691 	/*
4692 	 * Call ibt_close_rc_channel in blocking mode
4693 	 * with no callbacks.
4694 	 */
4695 	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4696 	    NULL, 0, NULL, NULL, 0);
4697 }
4698 
4699 /*
4700  * Goes through all connections and closes the channel
4701  * This will cause all the WRs on those channels to be
4702  * flushed.
4703  */
static void
rib_close_channels(rib_conn_list_t *connlist)
{
	CONN 		*conn, *tmp;

	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);
		/* Grab the successor before we may drop c_lock below. */
		tmp = conn->c_next;
		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {

			/*
			 * A connect is still in progress; just mark it
			 * so the connecting thread (rib_connect) closes
			 * it when it finishes.
			 */
			if (conn->c_state == C_CONN_PEND) {
				conn->c_flags |= C_CLOSE_PENDING;
				goto next;
			}

			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);

			/*
			 * Live connection in CONNECTED state.
			 */
			if (conn->c_state == C_CONNECTED)
				conn->c_state = C_ERROR_CONN;
			/* Drop c_lock across the blocking channel close. */
			mutex_exit(&conn->c_lock);

			rib_close_a_channel(conn);

			mutex_enter(&conn->c_lock);
			conn->c_flags &= ~C_CLOSE_PENDING;
			/* Signal a pending rib_disconnect_channel() */
			cv_signal(&conn->c_cv);
		}
next:
		mutex_exit(&conn->c_lock);
		conn = tmp;
	}
	rw_exit(&connlist->conn_lock);
}
4743 
4744 /*
4745  * Frees up all connections that are no longer being referenced
4746  */
static void
rib_purge_connlist(rib_conn_list_t *connlist)
{
	CONN 		*conn;

top:
	rw_enter(&connlist->conn_lock, RW_READER);
	conn = connlist->conn_hd;
	while (conn != NULL) {
		mutex_enter(&conn->c_lock);

		/*
		 * At this point connection is either in ERROR
		 * or DISCONN_PEND state. If in DISCONN_PEND state
		 * then some other thread is culling that connection.
		 * If not and if c_ref is 0, then destroy the connection.
		 */
		if (conn->c_ref == 0 &&
		    conn->c_state != C_DISCONN_PEND) {
			/*
			 * Cull the connection
			 */
			conn->c_state = C_DISCONN_PEND;
			mutex_exit(&conn->c_lock);
			/*
			 * conn_lock must be dropped to destroy the
			 * conn, so restart the scan from the head.
			 */
			rw_exit(&connlist->conn_lock);
			(void) rib_disconnect_channel(conn, connlist);
			goto top;
		} else {
			/*
			 * conn disconnect already scheduled or will
			 * happen from conn_release when c_ref drops to 0.
			 */
			mutex_exit(&conn->c_lock);
		}
		conn = conn->c_next;
	}
	rw_exit(&connlist->conn_lock);

	/*
	 * At this point, only connections with c_ref != 0 are on the list
	 */
}
4789 
4790 /*
4791  * Free all the HCA resources and close
4792  * the hca.
4793  */
4794 
/*
 * Release all IBTF resources owned by `hca' and close it: completion
 * queues first, then the buffer pools and server-side cache, the
 * protection domain, and finally the HCA handle itself.  Also
 * unregisters the RDMA module if this was the last HCA.
 */
static void
rib_free_hca(rib_hca_t *hca)
{
	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);

	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
	kmem_free(hca->svc_scq, sizeof (rib_cq_t));

	rib_rbufpool_destroy(hca, RECV_BUFFER);
	rib_rbufpool_destroy(hca, SEND_BUFFER);
	rib_destroy_cache(hca);
	/* Last HCA gone: unregister the plugin from the RDMATF. */
	if (rib_mod.rdma_count == 0)
		(void) rdma_unregister_mod(&rib_mod);
	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
	(void) ibt_close_hca(hca->hca_hdl);
	/* Mark the handle freed so callers can tell teardown completed. */
	hca->hca_hdl = NULL;
}
4817 
4818 
/*
 * Quiesce an HCA: stop its registered services, close and purge all
 * client and server connections, and free the HCA once its connection
 * lists are empty and any in-progress callbacks have drained.
 */
static void
rib_stop_hca_services(rib_hca_t *hca)
{
	rib_stop_services(hca);
	rib_close_channels(&hca->cl_conn_list);
	rib_close_channels(&hca->srv_conn_list);

	rib_purge_connlist(&hca->cl_conn_list);
	rib_purge_connlist(&hca->srv_conn_list);

	/* Last HCA gone: remove the global cache kstat. */
	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
		    GLOBAL_ZONEID);
		stats_enabled = FALSE;
	}

	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
	if (hca->srv_conn_list.conn_hd == NULL &&
	    hca->cl_conn_list.conn_hd == NULL) {
		/*
		 * conn_lists are NULL, so destroy
		 * buffers, close hca and be done.
		 */
		rib_free_hca(hca);
	}
	rw_exit(&hca->cl_conn_list.conn_lock);
	rw_exit(&hca->srv_conn_list.conn_lock);

	/*
	 * Connections were still outstanding above; wait for any
	 * in-flight callbacks to finish before freeing the HCA.
	 */
	if (hca->hca_hdl != NULL) {
		mutex_enter(&hca->inuse_lock);
		while (hca->inuse)
			cv_wait(&hca->cb_cv, &hca->inuse_lock);
		mutex_exit(&hca->inuse_lock);

		rib_free_hca(hca);
	}
	rw_destroy(&hca->bound_services_lock);

	if (hca->cleanup_helper != NULL) {
		ddi_taskq_destroy(hca->cleanup_helper);
		hca->cleanup_helper = NULL;
	}
}
4863 
4864 /*
4865  * Cleans and closes up all uses of the HCA
4866  */
static void
rib_detach_hca(ibt_hca_hdl_t hca_hdl)
{
	rib_hca_t *hca = NULL;
	rib_hca_t **hcap;

	/* Find the rib_hca_t for this handle and unlink it. */
	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
	for (hcap = &rib_stat->hcas_list; *hcap; hcap = &(*hcap)->next) {
		hca = *hcap;
		rw_enter(&hca->state_lock, RW_WRITER);
		if (hca->hca_hdl == hca_hdl) {
			/*
			 * Mark as detached and remove from
			 * hca list.
			 */
			hca->state = HCA_DETACHED;
			*hcap = hca->next;
			rib_stat->nhca_inited--;
			rib_mod.rdma_count--;
			rw_exit(&hca->state_lock);
			break;
		}
		rw_exit(&hca->state_lock);
	}
	rw_exit(&rib_stat->hcas_list_lock);

	/* Unknown handle: nothing to detach. */
	if (hca == NULL)
		return;
	ASSERT(hca->hca_hdl == hca_hdl);

	/*
	 * Stop all services on the HCA
	 * Go through cl_conn_list and close all rc_channels
	 * Go through svr_conn_list and close all rc_channels
	 * Free connections whose c_ref has dropped to 0
	 * Destroy all CQs
	 * Deregister and released all buffer pool memory after all
	 * connections are destroyed
	 * Free the protection domain
	 * ibt_close_hca()
	 */
	rib_stop_hca_services(hca);

	kmem_free(hca, sizeof (*hca));
}
4912 
/*
 * kmem reclaim / teardown callback: empty the entire server-side
 * buffer cache of `argp' (a rib_hca_t *), deregistering and freeing
 * every cached buffer and destroying each AVL node.
 */
static void
rib_server_side_cache_reclaim(void *argp)
{
	cache_avl_struct_t    *rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_first(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		/* Free every buffer queued on this length node. */
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);

			hca->cache_allocation -= rb->lrc_len;
			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		kmem_cache_free(hca->server_side_cache, rcas);
		/* Move on to the next node, if any remain. */
		rcas = avl_first(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}
4946 
/*
 * Taskq callback that trims the server-side buffer cache back under
 * cache_limit.  Frees nodes starting from the largest buffer sizes
 * (avl_last) until the footprint drops below the limit.
 */
static void
rib_server_side_cache_cleanup(void *argp)
{
	cache_avl_struct_t    *rcas;
	rib_lrc_entry_t		*rb;
	rib_hca_t *hca = (rib_hca_t *)argp;

	/* Fast path: already under the limit, nothing to trim. */
	mutex_enter(&hca->cache_allocation_lock);
	if (hca->cache_allocation < cache_limit) {
		mutex_exit(&hca->cache_allocation_lock);
		return;
	}
	mutex_exit(&hca->cache_allocation_lock);

	rw_enter(&hca->avl_rw_lock, RW_WRITER);
	rcas = avl_last(&hca->avl_tree);
	if (rcas != NULL)
		avl_remove(&hca->avl_tree, rcas);

	while (rcas != NULL) {
		/* Free every buffer queued on this length node. */
		while (rcas->r.forw != &rcas->r) {
			rcas->elements--;
			rb = rcas->r.forw;
			remque(rb);
			if (rb->registered)
				(void) rib_deregistermem_via_hca(hca,
				    rb->lrc_buf, rb->lrc_mhandle);

			hca->cache_allocation -= rb->lrc_len;

			kmem_free(rb->lrc_buf, rb->lrc_len);
			kmem_free(rb, sizeof (rib_lrc_entry_t));
		}
		mutex_destroy(&rcas->node_lock);
		if (hca->server_side_cache) {
			kmem_cache_free(hca->server_side_cache, rcas);
		}

		/* Stop trimming once back under the limit. */
		if (hca->cache_allocation < cache_limit) {
			rw_exit(&hca->avl_rw_lock);
			return;
		}

		rcas = avl_last(&hca->avl_tree);
		if (rcas != NULL)
			avl_remove(&hca->avl_tree, rcas);
	}
	rw_exit(&hca->avl_rw_lock);
}
4996 
4997 static int
4998 avl_compare(const void *t1, const void *t2)
4999 {
5000 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
5001 		return (0);
5002 
5003 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
5004 		return (-1);
5005 
5006 	return (1);
5007 }
5008 
5009 static void
5010 rib_destroy_cache(rib_hca_t *hca)
5011 {
5012 	if (hca->avl_init) {
5013 		rib_server_side_cache_reclaim((void *)hca);
5014 		if (hca->server_side_cache) {
5015 			kmem_cache_destroy(hca->server_side_cache);
5016 			hca->server_side_cache = NULL;
5017 		}
5018 		avl_destroy(&hca->avl_tree);
5019 		mutex_destroy(&hca->cache_allocation_lock);
5020 		rw_destroy(&hca->avl_rw_lock);
5021 	}
5022 	hca->avl_init = FALSE;
5023 }
5024 
5025 static void
5026 rib_force_cleanup(void *hca)
5027 {
5028 	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
5029 		(void) ddi_taskq_dispatch(
5030 		    ((rib_hca_t *)hca)->cleanup_helper,
5031 		    rib_server_side_cache_cleanup,
5032 		    (void *)hca, DDI_NOSLEEP);
5033 }
5034 
5035 static rib_lrc_entry_t *
5036 rib_get_cache_buf(CONN *conn, uint32_t len)
5037 {
5038 	cache_avl_struct_t	cas, *rcas;
5039 	rib_hca_t	*hca = (ctoqp(conn))->hca;
5040 	rib_lrc_entry_t *reply_buf;
5041 	avl_index_t where = NULL;
5042 	uint64_t c_alloc = 0;
5043 
5044 	if (!hca->avl_init)
5045 		goto  error_alloc;
5046 
5047 	cas.len = len;
5048 
5049 	rw_enter(&hca->avl_rw_lock, RW_READER);
5050 
5051 	mutex_enter(&hca->cache_allocation_lock);
5052 	c_alloc = hca->cache_allocation;
5053 	mutex_exit(&hca->cache_allocation_lock);
5054 
5055 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
5056 	    &where)) == NULL) {
5057 		/* Am I above the cache limit */
5058 		if ((c_alloc + len) >= cache_limit) {
5059 			rib_force_cleanup((void *)hca);
5060 			rw_exit(&hca->avl_rw_lock);
5061 			mutex_enter(&hca->cache_allocation_lock);
5062 			hca->cache_misses_above_the_limit ++;
5063 			mutex_exit(&hca->cache_allocation_lock);
5064 
5065 			/* Allocate and register the buffer directly */
5066 			goto error_alloc;
5067 		}
5068 
5069 		rw_exit(&hca->avl_rw_lock);
5070 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
5071 
5072 		/* Recheck to make sure no other thread added the entry in */
5073 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
5074 		    &cas, &where)) == NULL) {
5075 			/* Allocate an avl tree entry */
5076 			rcas = (cache_avl_struct_t *)
5077 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
5078 
5079 			bzero(rcas, sizeof (cache_avl_struct_t));
5080 			rcas->elements = 0;
5081 			rcas->r.forw = &rcas->r;
5082 			rcas->r.back = &rcas->r;
5083 			rcas->len = len;
5084 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5085 			avl_insert(&hca->avl_tree, rcas, where);
5086 		}
5087 	}
5088 
5089 	mutex_enter(&rcas->node_lock);
5090 
5091 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5092 		reply_buf = rcas->r.forw;
5093 		remque(reply_buf);
5094 		rcas->elements--;
5095 		mutex_exit(&rcas->node_lock);
5096 		rw_exit(&hca->avl_rw_lock);
5097 
5098 		mutex_enter(&hca->cache_allocation_lock);
5099 		hca->cache_hits++;
5100 		hca->cache_allocation -= len;
5101 		mutex_exit(&hca->cache_allocation_lock);
5102 	} else {
5103 		/* Am I above the cache limit */
5104 		mutex_exit(&rcas->node_lock);
5105 		if ((c_alloc + len) >= cache_limit) {
5106 			rib_force_cleanup((void *)hca);
5107 			rw_exit(&hca->avl_rw_lock);
5108 
5109 			mutex_enter(&hca->cache_allocation_lock);
5110 			hca->cache_misses_above_the_limit++;
5111 			mutex_exit(&hca->cache_allocation_lock);
5112 			/* Allocate and register the buffer directly */
5113 			goto error_alloc;
5114 		}
5115 		rw_exit(&hca->avl_rw_lock);
5116 		mutex_enter(&hca->cache_allocation_lock);
5117 		hca->cache_misses++;
5118 		mutex_exit(&hca->cache_allocation_lock);
5119 		/* Allocate a reply_buf entry */
5120 		reply_buf = (rib_lrc_entry_t *)
5121 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5122 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
5123 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
5124 		reply_buf->lrc_len  = len;
5125 		reply_buf->registered = FALSE;
5126 		reply_buf->avl_node = (void *)rcas;
5127 	}
5128 
5129 	return (reply_buf);
5130 
5131 error_alloc:
5132 	reply_buf = (rib_lrc_entry_t *)
5133 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5134 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
5135 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5136 	reply_buf->lrc_len = len;
5137 	reply_buf->registered = FALSE;
5138 	reply_buf->avl_node = NULL;
5139 
5140 	return (reply_buf);
5141 }
5142 
/*
 * Return a pre-registered buffer back to the cache (without
 * unregistering it).
 */
5147 
5148 static void
5149 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5150 {
5151 	cache_avl_struct_t    cas, *rcas;
5152 	avl_index_t where = NULL;
5153 	rib_hca_t	*hca = (ctoqp(conn))->hca;
5154 
5155 	if (!hca->avl_init)
5156 		goto  error_free;
5157 
5158 	cas.len = reg_buf->lrc_len;
5159 	rw_enter(&hca->avl_rw_lock, RW_READER);
5160 	if ((rcas = (cache_avl_struct_t *)
5161 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5162 		rw_exit(&hca->avl_rw_lock);
5163 		goto error_free;
5164 	} else {
5165 		cas.len = reg_buf->lrc_len;
5166 		mutex_enter(&rcas->node_lock);
5167 		insque(reg_buf, &rcas->r);
5168 		rcas->elements ++;
5169 		mutex_exit(&rcas->node_lock);
5170 		rw_exit(&hca->avl_rw_lock);
5171 		mutex_enter(&hca->cache_allocation_lock);
5172 		hca->cache_allocation += cas.len;
5173 		mutex_exit(&hca->cache_allocation_lock);
5174 	}
5175 
5176 	return;
5177 
5178 error_free:
5179 
5180 	if (reg_buf->registered)
5181 		(void) rib_deregistermem_via_hca(hca,
5182 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5183 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5184 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5185 }
5186 
5187 static rdma_stat
5188 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5189 	uint_t buflen, struct mrc *buf_handle)
5190 {
5191 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
5192 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
5193 	rdma_stat	status;
5194 
5195 
5196 	/*
5197 	 * Note: ALL buffer pools use the same memory type RDMARW.
5198 	 */
5199 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5200 	if (status == RDMA_SUCCESS) {
5201 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5202 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5203 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5204 	} else {
5205 		buf_handle->mrc_linfo = NULL;
5206 		buf_handle->mrc_lmr = 0;
5207 		buf_handle->mrc_rmr = 0;
5208 	}
5209 	return (status);
5210 }
5211 
5212 /* ARGSUSED */
5213 static rdma_stat
5214 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5215     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5216 {
5217 
5218 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5219 	return (RDMA_SUCCESS);
5220 }
5221 
5222 /* ARGSUSED */
5223 static rdma_stat
5224 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5225 {
5226 
5227 	(void) ibt_deregister_mr(hca->hca_hdl,
5228 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5229 	return (RDMA_SUCCESS);
5230 }
5231 
5232 /*
5233  * Check if the IP interface named by `lifrp' is RDMA-capable.
5234  */
5235 static boolean_t
5236 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5237 {
5238 	char ifname[LIFNAMSIZ];
5239 	char *cp;
5240 
5241 	if (lifrp->lifr_type == IFT_IB)
5242 		return (B_TRUE);
5243 
5244 	/*
5245 	 * Strip off the logical interface portion before getting
5246 	 * intimate with the name.
5247 	 */
5248 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5249 	if ((cp = strchr(ifname, ':')) != NULL)
5250 		*cp = '\0';
5251 
5252 	return (strcmp("lo0", ifname) == 0);
5253 }
5254 
/*
 * Issue a transparent STREAMS ioctl `cmd' (with `len' bytes of
 * argument at `arg') down to IP by opening /dev/udp.  Returns 0 on
 * success or an errno; EPROTO if the device cannot be opened.
 */
static int
rpcib_do_ip_ioctl(int cmd, int len, void *arg)
{
	vnode_t *kkvp, *vp;
	TIUSER  *tiptr;
	struct  strioctl iocb;
	k_sigset_t smask;
	int	err = 0;

	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kkvp) == 0) {
		if (t_kopen(NULL, kkvp->v_rdev, FREAD|FWRITE,
		    &tiptr, CRED()) == 0) {
			vp = tiptr->fp->f_vnode;
		} else {
			/* t_kopen failed: release the vnode from lookup. */
			VN_RELE(kkvp);
			return (EPROTO);
		}
	} else {
		return (EPROTO);
	}

	iocb.ic_cmd = cmd;
	iocb.ic_timout = 0;
	iocb.ic_len = len;
	iocb.ic_dp = (caddr_t)arg;
	/* Mask signals (sigintr/sigunintr) around the stream ioctl. */
	sigintr(&smask, 0);
	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
	sigunintr(&smask);
	(void) t_kclose(tiptr, 0);
	VN_RELE(kkvp);
	return (err);
}
5287 
5288 /*
5289  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5290  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5291  */
static int
rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
{
	int err;
	struct lifnum lifn;

	/* First ask IP how many interfaces are configured. */
	bzero(&lifn, sizeof (struct lifnum));
	lifn.lifn_family = AF_UNSPEC;

	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
	if (err != 0)
		return (err);

	/*
	 * Pad the interface count to account for additional interfaces that
	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;

	bzero(lifcp, sizeof (struct lifconf));
	lifcp->lifc_family = AF_UNSPEC;
	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);

	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
	if (err != 0) {
		/* On failure nothing is handed back; free our buffer. */
		kmem_free(lifcp->lifc_buf, *bufsizep);
		return (err);
	}
	/* Success: caller owns lifcp->lifc_buf (*bufsizep bytes). */
	return (0);
}
5323 
/*
 * Collect the IPv4 and IPv6 addresses of all RDMA-capable interfaces
 * into addrs4 and addrs6.  On B_TRUE the caller owns each ri_list
 * (ri_size bytes); on B_FALSE nothing is left allocated.
 */
static boolean_t
rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
{
	uint_t i, nifs;
	uint_t bufsize;
	struct lifconf lifc;
	struct lifreq *lifrp;
	struct sockaddr_in *sinp;
	struct sockaddr_in6 *sin6p;

	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
	bzero(addrs6, sizeof (rpcib_ipaddrs_t));

	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
		return (B_FALSE);

	/* No interfaces at all: free the lifconf buffer and fail. */
	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
		kmem_free(lifc.lifc_buf, bufsize);
		return (B_FALSE);
	}

	/*
	 * Worst case is that all of the addresses are IB-capable and have
	 * the same address family, so size our buffers accordingly.
	 */
	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);

	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
		if (!rpcib_rdma_capable_interface(lifrp))
			continue;

		/* Sort each capable interface's address by family. */
		if (lifrp->lifr_addr.ss_family == AF_INET) {
			sinp = addrs4->ri_list;
			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
			    sizeof (struct sockaddr_in));
		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
			sin6p = addrs6->ri_list;
			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
			    sizeof (struct sockaddr_in6));
		}
	}

	kmem_free(lifc.lifc_buf, bufsize);
	return (B_TRUE);
}
5372 
5373 /* ARGSUSED */
5374 static int
5375 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5376 {
5377 	rib_hca_t *hca;
5378 
5379 	if (KSTAT_WRITE == rw) {
5380 		return (EACCES);
5381 	}
5382 
5383 	rpcib_kstat.cache_limit.value.ui64 =
5384 	    (uint64_t)cache_limit;
5385 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5386 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5387 		rpcib_kstat.cache_allocation.value.ui64 +=
5388 		    (uint64_t)hca->cache_allocation;
5389 		rpcib_kstat.cache_hits.value.ui64 +=
5390 		    (uint64_t)hca->cache_hits;
5391 		rpcib_kstat.cache_misses.value.ui64 +=
5392 		    (uint64_t)hca->cache_misses;
5393 		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5394 		    (uint64_t)hca->cache_misses_above_the_limit;
5395 	}
5396 	rw_exit(&rib_stat->hcas_list_lock);
5397 	return (0);
5398 }
5399