xref: /titanic_52/usr/src/uts/common/rpc/rpcib.c (revision 60471b7bbfab236de7d8776aed871d919c5f81c3)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
 * Acknowledgements to contributions from developers:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 /*
41  * The rpcib plugin. Implements the interface for RDMATF's
42  * interaction with IBTF.
43  */
44 
45 #include <sys/param.h>
46 #include <sys/types.h>
47 #include <sys/user.h>
48 #include <sys/systm.h>
49 #include <sys/sysmacros.h>
50 #include <sys/proc.h>
51 #include <sys/socket.h>
52 #include <sys/file.h>
53 #include <sys/stream.h>
54 #include <sys/strsubr.h>
55 #include <sys/stropts.h>
56 #include <sys/errno.h>
57 #include <sys/kmem.h>
58 #include <sys/debug.h>
59 #include <sys/pathname.h>
60 #include <sys/kstat.h>
61 #include <sys/t_lock.h>
62 #include <sys/ddi.h>
63 #include <sys/cmn_err.h>
64 #include <sys/time.h>
65 #include <sys/isa_defs.h>
66 #include <sys/callb.h>
67 #include <sys/sunddi.h>
68 #include <sys/sunndi.h>
69 #include <sys/sdt.h>
70 #include <sys/ib/ibtl/ibti.h>
71 #include <rpc/rpc.h>
72 #include <rpc/ib.h>
73 #include <sys/modctl.h>
74 #include <sys/kstr.h>
75 #include <sys/sockio.h>
76 #include <sys/vnode.h>
77 #include <sys/tiuser.h>
78 #include <net/if.h>
79 #include <net/if_types.h>
80 #include <sys/cred.h>
81 #include <rpc/rpc_rdma.h>
82 #include <nfs/nfs.h>
83 #include <sys/atomic.h>
84 
85 #define	NFS_RDMA_PORT	20049
86 
87 
88 /*
89  * Convenience structures for connection management
90  */
/*
 * rpcib_ipaddrs_t
 * A counted, sized buffer of addresses.  ri_list is untyped, so the
 * byte length of the buffer (ri_size) is carried alongside the element
 * count (ri_count).  Two of these are handed to
 * rpcib_get_ib_addresses(), whose prototype follows below.
 */
typedef struct rpcib_ipaddrs {
	void	*ri_list;	/* pointer to list of addresses */
	uint_t	ri_count;	/* number of addresses in list */
	uint_t	ri_size;	/* size of ri_list in bytes */
} rpcib_ipaddrs_t;
96 
97 
/*
 * rpcib_ping_t
 * Bundle that is passed by pointer to rib_ping_srv(), rib_connect()
 * and rib_conn_to_srv() (see their prototypes later in this file).
 * The field meanings below are read off the names and types only;
 * confirm against those functions.
 */
typedef struct rpcib_ping {
	rib_hca_t  *hca;	/* HCA this entry refers to */
	ibt_path_info_t path;	/* IBTF path information */
	ibt_ip_addr_t srcip;	/* presumably the local (source) IP */
	ibt_ip_addr_t dstip;	/* presumably the remote (dest) IP */
} rpcib_ping_t;
104 
105 /*
106  * Prototype declarations for driver ops
107  */
/* DDI entry points; wired into rpcib_ops further down */
static int	rpcib_attach(dev_info_t *, ddi_attach_cmd_t);
static int	rpcib_getinfo(dev_info_t *, ddi_info_cmd_t,
				void *, void **);
static int	rpcib_detach(dev_info_t *, ddi_detach_cmd_t);

/* file-local helpers; all are static, so they are defined in this file */
static boolean_t rpcib_rdma_capable_interface(struct lifreq *);
static int	rpcib_do_ip_ioctl(int, int, void *);
static boolean_t rpcib_get_ib_addresses(rpcib_ipaddrs_t *, rpcib_ipaddrs_t *);
/* installed as ks_update of the "rpcib_cache" kstat in rpcib_open_hcas() */
static int rpcib_cache_kstat_update(kstat_t *, int);
static void rib_force_cleanup(void *);
/* called for every HCA by rpcib_free_hca_list() */
static void rib_stop_hca_services(rib_hca_t *);
static void rib_attach_hca(void);
static int rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
		struct netbuf *d_svcaddr, CONN **conn);
121 
/*
 * Named kstat counters for the buffer cache.  rpcib_open_hcas() installs
 * this structure as the ks_data of the virtual "rpcib_cache" kstat and
 * registers rpcib_cache_kstat_update() as its update routine.  Every
 * counter is declared as a 64-bit unsigned value.
 */
struct {
	kstat_named_t cache_limit;
	kstat_named_t cache_allocation;
	kstat_named_t cache_hits;
	kstat_named_t cache_misses;
	kstat_named_t cache_misses_above_the_limit;
} rpcib_kstat = {
	{"cache_limit",			KSTAT_DATA_UINT64 },
	{"cache_allocation",		KSTAT_DATA_UINT64 },
	{"cache_hits",			KSTAT_DATA_UINT64 },
	{"cache_misses",		KSTAT_DATA_UINT64 },
	{"cache_misses_above_the_limit", KSTAT_DATA_UINT64 },
};
135 
136 /* rpcib cb_ops */
/*
 * Character/block entry points for the "rpcib" minor node.  Only open
 * and close are wired to nulldev; every data-path entry point is nodev,
 * so the node offers no read/write/ioctl interface of its own.
 * Referenced from rpcib_ops below.
 */
static struct cb_ops rpcib_cbops = {
	nulldev,		/* open */
	nulldev,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	nodev,			/* write */
	nodev,			/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	nochpoll,		/* poll */
	ddi_prop_op,		/* prop_op */
	NULL,			/* stream */
	D_MP,			/* cb_flag */
	CB_REV,			/* rev */
	nodev,			/* int (*cb_aread)() */
	nodev			/* int (*cb_awrite)() */
};
157 
158 /*
159  * Device options
160  */
/*
 * dev_ops vector for the driver: getinfo/attach/detach are implemented
 * in this file, identify and probe are nulldev, and there are no bus or
 * power operations.  Referenced from rib_modldrv below.
 */
static struct dev_ops rpcib_ops = {
	DEVO_REV,		/* devo_rev, */
	0,			/* refcnt  */
	rpcib_getinfo,		/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	rpcib_attach,		/* attach */
	rpcib_detach,		/* detach */
	nodev,			/* reset */
	&rpcib_cbops,		    /* cb_ops (character/block entry points) */
	NULL,			/* bus operations */
	NULL,			/* power */
	ddi_quiesce_not_needed,		/* quiesce */
};
175 
176 /*
177  * Module linkage information.
178  */
179 
/* Driver linkage record; ties the module to rpcib_ops. */
static struct modldrv rib_modldrv = {
	&mod_driverops,		/* Driver module */
	"RPCIB plugin driver",	/* Driver name and version */
	&rpcib_ops,		/* Driver ops */
};
185 
/*
 * Module linkage handed to mod_install(), mod_remove() and mod_info()
 * by _init(), _fini() and _info().  Holds the single driver record
 * above, followed by the NULL terminator.
 */
static struct modlinkage rib_modlinkage = {
	MODREV_1,
	(void *)&rib_modldrv,
	NULL
};
191 
/*
 * rib_lrc_entry_t
 * One cached buffer.  rib_get_cache_buf() returns a pointer to one of
 * these and rib_free_cache_buf() takes one back (see prototypes below);
 * an instance is also embedded as member 'r' of cache_avl_struct_t.
 * Field notes are read off names and types only -- confirm against the
 * cache code, which is outside this part of the file.
 */
typedef struct rib_lrc_entry {
	struct rib_lrc_entry *forw;	/* doubly linked list: next */
	struct rib_lrc_entry *back;	/* doubly linked list: previous */
	char *lrc_buf;			/* the buffer itself */

	uint32_t lrc_len;		/* presumably length of lrc_buf */
	void  *avl_node;		/* presumably the owning AVL node */
	bool_t registered;		/* presumably: memory is registered */

	struct mrc lrc_mhandle;		/* memory registration handle */
	bool_t lrc_on_freed_list;
} rib_lrc_entry_t;
204 
/*
 * cache_avl_struct_t
 * Node type of the per-HCA AVL tree (hca->avl_tree).  rpcib_open_hcas()
 * creates that tree with this structure's size and the offset of
 * avl_link, and creates the kmem cache hca->server_side_cache with
 * objects of this size.
 */
typedef	struct cache_struct	{
	rib_lrc_entry_t		r;
	uint32_t		len;
	uint32_t		elements;
	kmutex_t		node_lock;
	avl_node_t		avl_link;	/* linkage used by avl_create() */
} cache_avl_struct_t;
212 
/*
 * Buffer cache sizing: a 100 MB limit and an 80 MB watermark.  How the
 * two values are applied is decided by the cache code, which is not in
 * this part of the file.
 */
uint64_t	cache_limit = 100 * 1024 * 1024;
static uint64_t	cache_watermark = 80 * 1024 * 1024;
/* set to TRUE by rpcib_open_hcas() once the "rpcib_cache" kstat exists */
static bool_t	stats_enabled = FALSE;

static uint64_t max_unsignaled_rws = 5;
/* port used for NFS over RDMA; defaults to NFS_RDMA_PORT (20049) */
int nfs_rdma_port = NFS_RDMA_PORT;

/* netid strings */
#define	RIBNETID_TCP	"tcp"
#define	RIBNETID_TCP6	"tcp6"

/*
 * rib_stat: private data pointer used when registering
 *	with the IBTF.  It is returned to the consumer
 *	in all callbacks.
 *	Allocated in rpcib_attach() and freed (and reset to NULL) in
 *	rpcib_detach() or on an attach failure.
 */
static rpcib_state_t *rib_stat = NULL;

#define	RNR_RETRIES	IBT_RNR_RETRY_1
#define	MAX_PORTS	2
/*
 * Work request id that carries no send_wid: the send completion
 * handler skips completions whose wc_id equals this value.
 */
#define	RDMA_DUMMY_WRID	0x4D3A1D4D3A1D
#define	RDMA_CONN_REAP_RETRY	10	/* 10 secs */

int preposted_rbufs = RDMA_BUFS_GRANT;
int send_threshold = 1;

/*
 * Old cards with Tavor driver have limited memory footprint
 * when booted in 32bit. The rib_max_rbufs tunable can be
 * tuned for more buffers if needed.
 * rpcib_open_hcas() uses it as the size of both the receive and the
 * send buffer pool of every HCA.
 */

#if !defined(_ELF64) && !defined(__sparc)
int rib_max_rbufs = MAX_BUFS;
#else
int rib_max_rbufs = 10 * MAX_BUFS;
#endif	/* !(_ELF64) && !(__sparc) */

int rib_conn_timeout = 60 * 12;		/* 12 minutes */

/*
 * State of the plugin.
 * ACCEPT = accepting new connections and requests.
 * NO_ACCEPT = not accepting new connection and requests.
 * This should eventually move to rpcib_state_t structure, since this
 * will tell in which state the plugin is for a particular type of service
 * like NFS, NLM or v4 Callback daemon. The plugin might be in accept
 * state for one and in no_accept state for the other.
 *
 * plugin_state_lock is created in _init(), destroyed in _fini(), and
 * held by rpcib_detach() while it switches the state to NO_ACCEPT.
 */
int		plugin_state;
kmutex_t	plugin_state_lock;

ldi_ident_t rpcib_li;
265 
266 /*
267  * RPCIB RDMATF operations
268  */
/*
 * Most of the routines declared here are published to RDMATF through
 * the rib_ops vector below; the remainder (for example rib_disconnect,
 * rib_ping_srv, rib_connect and the rbuf/cache helpers) are internal.
 */

/* reachability, connection teardown and listening */
static rdma_stat rib_reachable(int addr_type, struct netbuf *, void **handle);
static rdma_stat rib_disconnect(CONN *conn);
static void rib_listen(struct rdma_svc_data *rd);
static void rib_listen_stop(struct rdma_svc_data *rd);

/* memory registration, per connection and per HCA */
static rdma_stat rib_registermem(CONN *conn, caddr_t  adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem(CONN *conn, caddr_t buf,
	struct mrc buf_handle);
static rdma_stat rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp,
		caddr_t buf, uint_t buflen, struct mrc *buf_handle);
static rdma_stat rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf,
		struct mrc buf_handle);
static rdma_stat rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf,
	uint_t buflen, struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle,
	void *lrc);
static rdma_stat rib_deregistermemsync(CONN *conn, caddr_t buf,
	struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle, void *);
static rdma_stat rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle,
	caddr_t buf, int len, int cpu);

/* registered buffer allocation */
static rdma_stat rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf);

static void rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf);
static void *rib_rbuf_alloc(CONN *, rdma_buf_t *);

/* rib_rbuf_free() is what the send completion handler uses on its buffers */
static void rib_rbuf_free(CONN *conn, int ptype, void *buf);

/* message send/receive and RDMA read/write */
static rdma_stat rib_send(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp(CONN *conn, struct clist *cl, uint32_t msgid);
static rdma_stat rib_post_resp_remove(CONN *conn, uint32_t msgid);
static rdma_stat rib_post_recv(CONN *conn, struct clist *cl);
static rdma_stat rib_recv(CONN *conn, struct clist **clp, uint32_t msgid);
static rdma_stat rib_read(CONN *conn, struct clist *cl, int wait);
static rdma_stat rib_write(CONN *conn, struct clist *cl, int wait);

/* connection lookup and establishment */
static rdma_stat rib_ping_srv(int addr_type, struct netbuf *, rpcib_ping_t *);
static rdma_stat rib_conn_get(struct netbuf *, struct netbuf *,
	int addr_type, void *, CONN **);
static rdma_stat rib_conn_release(CONN *conn);
static rdma_stat rib_connect(struct netbuf *, struct netbuf *, int,
	rpcib_ping_t *, CONN **);
static rdma_stat rib_getinfo(rdma_info_t *info);

/* buffer cache; the reclaim routine and avl_compare are used by rpcib_open_hcas() */
static rib_lrc_entry_t *rib_get_cache_buf(CONN *conn, uint32_t len);
static void rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *buf);
static void rib_destroy_cache(rib_hca_t *hca);
static	void	rib_server_side_cache_reclaim(void *argp);
static int avl_compare(const void *t1, const void *t2);

/* service and channel shutdown */
static void rib_stop_services(rib_hca_t *);
static void rib_close_channels(rib_conn_list_t *);
static void rib_conn_close(void *);
321 
322 /*
323  * RPCIB addressing operations
324  */
325 
326 /*
327  * RDMA operations the RPCIB module exports
328  */
/*
 * Operation vector handed to RDMATF through rib_mod.  The initializer is
 * positional, so the order of the entries has to match the member order
 * of rdmaops_t (declared outside this file).
 */
static rdmaops_t rib_ops = {
	/* reachability and connections */
	rib_reachable,
	rib_conn_get,
	rib_conn_release,
	/* server side listening */
	rib_listen,
	rib_listen_stop,
	/* memory registration */
	rib_registermem,
	rib_deregistermem,
	rib_registermemsync,
	rib_deregistermemsync,
	rib_syncmem,
	/* registered buffers */
	rib_reg_buf_alloc,
	rib_reg_buf_free,
	/* messaging */
	rib_send,
	rib_send_resp,
	rib_post_resp,
	rib_post_resp_remove,
	rib_post_recv,
	rib_recv,
	/* RDMA read/write */
	rib_read,
	rib_write,
	rib_getinfo,
};
352 
353 /*
354  * RDMATF RPCIB plugin details
355  */
/*
 * Module descriptor passed to rdma_register_mod() by rpcib_attach().
 * Its rdma_count member is kept in step with the number of initialized
 * HCAs by rpcib_open_hcas() and rpcib_free_hca_list().
 */
static rdma_mod_t rib_mod = {
	"ibtf",		/* api name */
	RDMATF_VERS_1,
	0,		/* presumably rdma_count -- confirm member order */
	&rib_ops,	/* rdma op vector for ibtf */
};
362 
/* HCA bring-up; rpcib_open_hcas() is called from rpcib_attach() */
static rdma_stat rpcib_open_hcas(rpcib_state_t *);
static rdma_stat rib_qp_init(rib_qp_t *, int);

/* completion queue handlers; installed per HCA by rpcib_open_hcas() */
static void rib_svc_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_scq_handler(ibt_cq_hdl_t, void *);
static void rib_clnt_rcq_handler(ibt_cq_hdl_t, void *);
static void rib_svc_rcq_handler(ibt_cq_hdl_t, void *);

/* buffer pools and memory registration */
static rib_bufpool_t *rib_rbufpool_create(rib_hca_t *hca, int ptype, int num);
static rdma_stat rib_reg_mem(rib_hca_t *, caddr_t adsp, caddr_t, uint_t,
	ibt_mr_flags_t, ibt_mr_hdl_t *, ibt_mr_desc_t *);
static rdma_stat rib_reg_mem_user(rib_hca_t *, caddr_t, uint_t, ibt_mr_flags_t,
	ibt_mr_hdl_t *, ibt_mr_desc_t *, caddr_t);

/* channel creation */
static rdma_stat rib_conn_to_srv(rib_hca_t *, rib_qp_t *, rpcib_ping_t *);
static rdma_stat rib_clnt_create_chan(rib_hca_t *, struct netbuf *,
	rib_qp_t **);
static rdma_stat rib_svc_create_chan(rib_hca_t *, caddr_t, uint8_t,
	rib_qp_t **);

/* send_wid bookkeeping; rib_free_sendwait() is used by the send CQ handler */
static rdma_stat rib_sendwait(rib_qp_t *, struct send_wid *);
static struct send_wid *rib_init_sendwait(uint32_t, int, rib_qp_t *);
static int rib_free_sendwait(struct send_wid *);

/* rdma_done list maintenance */
static struct rdma_done_list *rdma_done_add(rib_qp_t *qp, uint32_t xid);
static void rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd);
static void rdma_done_rem_list(rib_qp_t *);
static void rdma_done_notify(rib_qp_t *qp, uint32_t xid);

/* IBTF async event handler; registered through rib_modinfo */
static void rib_async_handler(void *,
	ibt_hca_hdl_t, ibt_async_code_t, ibt_async_event_t *);
static rdma_stat rib_rem_rep(rib_qp_t *, struct reply *);
static struct svc_recv *rib_init_svc_recv(rib_qp_t *, ibt_wr_ds_t *);
static int rib_free_svc_recv(struct svc_recv *);
static struct recv_wid *rib_create_wid(rib_qp_t *, ibt_wr_ds_t *, uint32_t);
static void rib_free_wid(struct recv_wid *);
static rdma_stat rib_disconnect_channel(CONN *, rib_conn_list_t *);
static void rib_detach_hca(rib_hca_t *);
static void rib_close_a_channel(CONN *);
/* send reference counting; the send CQ handler drops a hold with _rele */
static void rib_send_hold(rib_qp_t *);
static void rib_send_rele(rib_qp_t *);
399 
400 /*
401  * Registration with IBTF as a consumer
402  */
/*
 * Client description passed to ibt_attach() by rpcib_attach(); names
 * rib_async_handler() as the async event handler and registers no
 * memory region handler.
 */
static struct ibt_clnt_modinfo_s rib_modinfo = {
	IBTI_V_CURR,
	IBT_GENERIC,
	rib_async_handler,	/* async event handler */
	NULL,			/* Memory Region Handler */
	"nfs/ib"
};
410 
/*
 * Global structure
 */
414 
/*
 * Per-driver soft state.  rpcib_attach() records the devinfo pointer in
 * rpcib_dip and rpcib_detach() clears it again, both while holding
 * rpcib_mutex; rpcib_getinfo() reports rpcib_dip for
 * DDI_INFO_DEVT2DEVINFO.
 */
typedef struct rpcib_s {
	dev_info_t	*rpcib_dip;	/* NULL while not attached */
	kmutex_t	rpcib_mutex;	/* protects updates of rpcib_dip */
} rpcib_t;

rpcib_t rpcib;
421 
/*
 * /etc/system controlled variable to control
 * debugging in rpcib kernel module.
 * Set it to values greater than 1 to control
 * the amount of debugging messages required.
 * Defaults to 0.
 */
int rib_debug = 0;
429 
430 int
431 _init(void)
432 {
433 	int error;
434 
435 	error = mod_install((struct modlinkage *)&rib_modlinkage);
436 	if (error != 0) {
437 		/*
438 		 * Could not load module
439 		 */
440 		return (error);
441 	}
442 	mutex_init(&plugin_state_lock, NULL, MUTEX_DRIVER, NULL);
443 	return (0);
444 }
445 
446 int
447 _fini()
448 {
449 	int status;
450 
451 	/*
452 	 * Remove module
453 	 */
454 	if ((status = mod_remove(&rib_modlinkage)) != 0) {
455 		return (status);
456 	}
457 	mutex_destroy(&plugin_state_lock);
458 	return (0);
459 }
460 
461 int
462 _info(struct modinfo *modinfop)
463 {
464 	return (mod_info(&rib_modlinkage, modinfop));
465 }
466 
467 /*
468  * rpcib_getinfo()
469  * Given the device number, return the devinfo pointer or the
470  * instance number.
471  * Note: always succeed DDI_INFO_DEVT2INSTANCE, even before attach.
472  */
473 
474 /*ARGSUSED*/
475 static int
476 rpcib_getinfo(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result)
477 {
478 	int ret = DDI_SUCCESS;
479 
480 	switch (cmd) {
481 	case DDI_INFO_DEVT2DEVINFO:
482 		if (rpcib.rpcib_dip != NULL)
483 			*result = rpcib.rpcib_dip;
484 		else {
485 			*result = NULL;
486 			ret = DDI_FAILURE;
487 		}
488 		break;
489 
490 	case DDI_INFO_DEVT2INSTANCE:
491 		*result = NULL;
492 		break;
493 
494 	default:
495 		ret = DDI_FAILURE;
496 	}
497 	return (ret);
498 }
499 
500 static void
501 rpcib_free_hca_list()
502 {
503 	rib_hca_t *hca, *hcap;
504 
505 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
506 	hca = rib_stat->hcas_list;
507 	rib_stat->hcas_list = NULL;
508 	rw_exit(&rib_stat->hcas_list_lock);
509 	while (hca != NULL) {
510 		rw_enter(&hca->state_lock, RW_WRITER);
511 		hcap = hca;
512 		hca = hca->next;
513 		rib_stat->nhca_inited--;
514 		rib_mod.rdma_count--;
515 		hcap->state = HCA_DETACHED;
516 		rw_exit(&hcap->state_lock);
517 		rib_stop_hca_services(hcap);
518 
519 		kmem_free(hcap, sizeof (*hcap));
520 	}
521 }
522 
523 static rdma_stat
524 rpcib_free_service_list()
525 {
526 	rib_service_t *service;
527 	ibt_status_t ret;
528 
529 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
530 	while (rib_stat->service_list != NULL) {
531 		service = rib_stat->service_list;
532 		ret = ibt_unbind_all_services(service->srv_hdl);
533 		if (ret != IBT_SUCCESS) {
534 			rw_exit(&rib_stat->service_list_lock);
535 #ifdef DEBUG
536 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
537 			    "ibt_unbind_all_services failed (%d)\n", (int)ret);
538 #endif
539 			return (RDMA_FAILED);
540 		}
541 		ret = ibt_deregister_service(rib_stat->ibt_clnt_hdl,
542 		    service->srv_hdl);
543 		if (ret != IBT_SUCCESS) {
544 			rw_exit(&rib_stat->service_list_lock);
545 #ifdef DEBUG
546 			cmn_err(CE_NOTE, "rpcib_free_service_list: "
547 			    "ibt_deregister_service failed (%d)\n", (int)ret);
548 #endif
549 			return (RDMA_FAILED);
550 		}
551 		rib_stat->service_list = service->next;
552 		kmem_free(service, sizeof (rib_service_t));
553 	}
554 	rw_exit(&rib_stat->service_list_lock);
555 
556 	return (RDMA_SUCCESS);
557 }
558 
559 static int
560 rpcib_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
561 {
562 	ibt_status_t	ibt_status;
563 	rdma_stat	r_status;
564 
565 	switch (cmd) {
566 	case DDI_ATTACH:
567 		break;
568 	case DDI_RESUME:
569 		return (DDI_SUCCESS);
570 	default:
571 		return (DDI_FAILURE);
572 	}
573 
574 	mutex_init(&rpcib.rpcib_mutex, NULL, MUTEX_DRIVER, NULL);
575 
576 	mutex_enter(&rpcib.rpcib_mutex);
577 	if (rpcib.rpcib_dip != NULL) {
578 		mutex_exit(&rpcib.rpcib_mutex);
579 		return (DDI_FAILURE);
580 	}
581 	rpcib.rpcib_dip = dip;
582 	mutex_exit(&rpcib.rpcib_mutex);
583 	/*
584 	 * Create the "rpcib" minor-node.
585 	 */
586 	if (ddi_create_minor_node(dip,
587 	    "rpcib", S_IFCHR, 0, DDI_PSEUDO, 0) != DDI_SUCCESS) {
588 		/* Error message, no cmn_err as they print on console */
589 		return (DDI_FAILURE);
590 	}
591 
592 	if (rib_stat == NULL) {
593 		rib_stat = kmem_zalloc(sizeof (*rib_stat), KM_SLEEP);
594 		mutex_init(&rib_stat->open_hca_lock, NULL, MUTEX_DRIVER, NULL);
595 		rw_init(&rib_stat->hcas_list_lock, NULL, RW_DRIVER, NULL);
596 		mutex_init(&rib_stat->listen_lock, NULL, MUTEX_DRIVER, NULL);
597 	}
598 
599 	rib_stat->hca_count = ibt_get_hca_list(NULL);
600 	if (rib_stat->hca_count < 1) {
601 		mutex_destroy(&rib_stat->listen_lock);
602 		rw_destroy(&rib_stat->hcas_list_lock);
603 		mutex_destroy(&rib_stat->open_hca_lock);
604 		kmem_free(rib_stat, sizeof (*rib_stat));
605 		rib_stat = NULL;
606 		return (DDI_FAILURE);
607 	}
608 
609 	ibt_status = ibt_attach(&rib_modinfo, dip,
610 	    (void *)rib_stat, &rib_stat->ibt_clnt_hdl);
611 
612 	if (ibt_status != IBT_SUCCESS) {
613 		mutex_destroy(&rib_stat->listen_lock);
614 		rw_destroy(&rib_stat->hcas_list_lock);
615 		mutex_destroy(&rib_stat->open_hca_lock);
616 		kmem_free(rib_stat, sizeof (*rib_stat));
617 		rib_stat = NULL;
618 		return (DDI_FAILURE);
619 	}
620 
621 	rib_stat->service_list = NULL;
622 	rw_init(&rib_stat->service_list_lock, NULL, RW_DRIVER, NULL);
623 	mutex_enter(&rib_stat->open_hca_lock);
624 	if (rpcib_open_hcas(rib_stat) != RDMA_SUCCESS) {
625 		mutex_exit(&rib_stat->open_hca_lock);
626 		goto open_fail;
627 	}
628 	mutex_exit(&rib_stat->open_hca_lock);
629 
630 	if (ddi_prop_update_int(DDI_DEV_T_NONE, dip, DDI_NO_AUTODETACH, 1) !=
631 	    DDI_PROP_SUCCESS) {
632 		cmn_err(CE_WARN, "rpcib_attach: ddi-no-autodetach prop update "
633 		    "failed.");
634 		goto register_fail;
635 	}
636 
637 	/*
638 	 * Register with rdmatf
639 	 */
640 	r_status = rdma_register_mod(&rib_mod);
641 	if (r_status != RDMA_SUCCESS && r_status != RDMA_REG_EXIST) {
642 		cmn_err(CE_WARN, "rpcib_attach:rdma_register_mod failed, "
643 		    "status = %d", r_status);
644 		goto register_fail;
645 	}
646 
647 	return (DDI_SUCCESS);
648 
649 register_fail:
650 
651 open_fail:
652 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
653 	rpcib_free_hca_list();
654 	(void) rpcib_free_service_list();
655 	mutex_destroy(&rib_stat->listen_lock);
656 	rw_destroy(&rib_stat->hcas_list_lock);
657 	mutex_destroy(&rib_stat->open_hca_lock);
658 	rw_destroy(&rib_stat->service_list_lock);
659 	kmem_free(rib_stat, sizeof (*rib_stat));
660 	rib_stat = NULL;
661 	return (DDI_FAILURE);
662 }
663 
664 /*ARGSUSED*/
665 static int
666 rpcib_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
667 {
668 	switch (cmd) {
669 
670 	case DDI_DETACH:
671 		break;
672 
673 	case DDI_SUSPEND:
674 	default:
675 		return (DDI_FAILURE);
676 	}
677 
678 	/*
679 	 * Detach the hca and free resources
680 	 */
681 	mutex_enter(&plugin_state_lock);
682 	plugin_state = NO_ACCEPT;
683 	mutex_exit(&plugin_state_lock);
684 
685 	if (rpcib_free_service_list() != RDMA_SUCCESS)
686 		return (DDI_FAILURE);
687 	rpcib_free_hca_list();
688 
689 	(void) ibt_detach(rib_stat->ibt_clnt_hdl);
690 	mutex_destroy(&rib_stat->listen_lock);
691 	rw_destroy(&rib_stat->hcas_list_lock);
692 	mutex_destroy(&rib_stat->open_hca_lock);
693 	rw_destroy(&rib_stat->service_list_lock);
694 
695 	kmem_free(rib_stat, sizeof (*rib_stat));
696 	rib_stat = NULL;
697 
698 	mutex_enter(&rpcib.rpcib_mutex);
699 	rpcib.rpcib_dip = NULL;
700 	mutex_exit(&rpcib.rpcib_mutex);
701 	mutex_destroy(&rpcib.rpcib_mutex);
702 	return (DDI_SUCCESS);
703 }
704 
705 
/* buffer pool teardown; rib_rbufpool_destroy() is used by rpcib_open_hcas() */
static void rib_rbufpool_free(rib_hca_t *, int);
static void rib_rbufpool_deregister(rib_hca_t *, int);
static void rib_rbufpool_destroy(rib_hca_t *hca, int ptype);
/* reply list maintenance, per queue pair */
static struct reply *rib_addreplylist(rib_qp_t *, uint32_t);
static rdma_stat rib_rem_replylist(rib_qp_t *);
static int rib_remreply(rib_qp_t *, struct reply *);
/* connection list maintenance */
static rdma_stat rib_add_connlist(CONN *, rib_conn_list_t *);
static rdma_stat rib_rm_conn(CONN *, rib_conn_list_t *);
714 
715 
716 /*
717  * One CQ pair per HCA
718  */
719 static rdma_stat
720 rib_create_cq(rib_hca_t *hca, uint32_t cq_size, ibt_cq_handler_t cq_handler,
721 	rib_cq_t **cqp)
722 {
723 	rib_cq_t	*cq;
724 	ibt_cq_attr_t	cq_attr;
725 	uint32_t	real_size;
726 	ibt_status_t	status;
727 	rdma_stat	error = RDMA_SUCCESS;
728 
729 	cq = kmem_zalloc(sizeof (rib_cq_t), KM_SLEEP);
730 	cq->rib_hca = hca;
731 	cq_attr.cq_size = cq_size;
732 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
733 	status = ibt_alloc_cq(hca->hca_hdl, &cq_attr, &cq->rib_cq_hdl,
734 	    &real_size);
735 	if (status != IBT_SUCCESS) {
736 		cmn_err(CE_WARN, "rib_create_cq: ibt_alloc_cq() failed,"
737 		    " status=%d", status);
738 		error = RDMA_FAILED;
739 		goto fail;
740 	}
741 	ibt_set_cq_handler(cq->rib_cq_hdl, cq_handler, hca);
742 
743 	/*
744 	 * Enable CQ callbacks. CQ Callbacks are single shot
745 	 * (e.g. you have to call ibt_enable_cq_notify()
746 	 * after each callback to get another one).
747 	 */
748 	status = ibt_enable_cq_notify(cq->rib_cq_hdl, IBT_NEXT_COMPLETION);
749 	if (status != IBT_SUCCESS) {
750 		cmn_err(CE_WARN, "rib_create_cq: "
751 		    "enable_cq_notify failed, status %d", status);
752 		error = RDMA_FAILED;
753 		goto fail;
754 	}
755 	*cqp = cq;
756 
757 	return (error);
758 fail:
759 	if (cq->rib_cq_hdl)
760 		(void) ibt_free_cq(cq->rib_cq_hdl);
761 	if (cq)
762 		kmem_free(cq, sizeof (rib_cq_t));
763 	return (error);
764 }
765 
766 /*
767  * rpcib_find_hca
768  *
769  * Caller should have already locked the hcas_lock before calling
770  * this function.
771  */
772 static rib_hca_t *
773 rpcib_find_hca(rpcib_state_t *ribstat, ib_guid_t guid)
774 {
775 	rib_hca_t *hca = ribstat->hcas_list;
776 
777 	while (hca && hca->hca_guid != guid)
778 		hca = hca->next;
779 
780 	return (hca);
781 }
782 
783 static rdma_stat
784 rpcib_open_hcas(rpcib_state_t *ribstat)
785 {
786 	rib_hca_t		*hca;
787 	ibt_status_t		ibt_status;
788 	rdma_stat		status;
789 	ibt_hca_portinfo_t	*pinfop;
790 	ibt_pd_flags_t		pd_flags = IBT_PD_NO_FLAGS;
791 	uint_t			size, cq_size;
792 	int			i;
793 	kstat_t *ksp;
794 	cache_avl_struct_t example_avl_node;
795 	char rssc_name[32];
796 	int old_nhca_inited = ribstat->nhca_inited;
797 	ib_guid_t		*hca_guids;
798 
799 	ASSERT(MUTEX_HELD(&ribstat->open_hca_lock));
800 
801 	ribstat->hca_count = ibt_get_hca_list(&hca_guids);
802 	if (ribstat->hca_count == 0)
803 		return (RDMA_FAILED);
804 
805 	rw_enter(&ribstat->hcas_list_lock, RW_WRITER);
806 	/*
807 	 * Open a hca and setup for RDMA
808 	 */
809 	for (i = 0; i < ribstat->hca_count; i++) {
810 		if (rpcib_find_hca(ribstat, hca_guids[i]))
811 			continue;
812 		hca = kmem_zalloc(sizeof (rib_hca_t), KM_SLEEP);
813 
814 		ibt_status = ibt_open_hca(ribstat->ibt_clnt_hdl,
815 		    hca_guids[i], &hca->hca_hdl);
816 		if (ibt_status != IBT_SUCCESS) {
817 			kmem_free(hca, sizeof (rib_hca_t));
818 			continue;
819 		}
820 		hca->hca_guid = hca_guids[i];
821 		hca->ibt_clnt_hdl = ribstat->ibt_clnt_hdl;
822 		hca->state = HCA_INITED;
823 
824 		/*
825 		 * query HCA info
826 		 */
827 		ibt_status = ibt_query_hca(hca->hca_hdl, &hca->hca_attrs);
828 		if (ibt_status != IBT_SUCCESS) {
829 			goto fail1;
830 		}
831 
832 		/*
833 		 * One PD (Protection Domain) per HCA.
834 		 * A qp is allowed to access a memory region
835 		 * only when it's in the same PD as that of
836 		 * the memory region.
837 		 */
838 		ibt_status = ibt_alloc_pd(hca->hca_hdl, pd_flags, &hca->pd_hdl);
839 		if (ibt_status != IBT_SUCCESS) {
840 			goto fail1;
841 		}
842 
843 		/*
844 		 * query HCA ports
845 		 */
846 		ibt_status = ibt_query_hca_ports(hca->hca_hdl,
847 		    0, &pinfop, &hca->hca_nports, &size);
848 		if (ibt_status != IBT_SUCCESS) {
849 			goto fail2;
850 		}
851 		hca->hca_ports = pinfop;
852 		hca->hca_pinfosz = size;
853 		pinfop = NULL;
854 
855 		cq_size = DEF_CQ_SIZE; /* default cq size */
856 		/*
857 		 * Create 2 pairs of cq's (1 pair for client
858 		 * and the other pair for server) on this hca.
859 		 * If number of qp's gets too large, then several
860 		 * cq's will be needed.
861 		 */
862 		status = rib_create_cq(hca, cq_size, rib_svc_rcq_handler,
863 		    &hca->svc_rcq);
864 		if (status != RDMA_SUCCESS) {
865 			goto fail3;
866 		}
867 
868 		status = rib_create_cq(hca, cq_size, rib_svc_scq_handler,
869 		    &hca->svc_scq);
870 		if (status != RDMA_SUCCESS) {
871 			goto fail3;
872 		}
873 
874 		status = rib_create_cq(hca, cq_size, rib_clnt_rcq_handler,
875 		    &hca->clnt_rcq);
876 		if (status != RDMA_SUCCESS) {
877 			goto fail3;
878 		}
879 
880 		status = rib_create_cq(hca, cq_size, rib_clnt_scq_handler,
881 		    &hca->clnt_scq);
882 		if (status != RDMA_SUCCESS) {
883 			goto fail3;
884 		}
885 
886 		/*
887 		 * Create buffer pools.
888 		 * Note rib_rbuf_create also allocates memory windows.
889 		 */
890 		hca->recv_pool = rib_rbufpool_create(hca,
891 		    RECV_BUFFER, rib_max_rbufs);
892 		if (hca->recv_pool == NULL) {
893 			goto fail3;
894 		}
895 
896 		hca->send_pool = rib_rbufpool_create(hca,
897 		    SEND_BUFFER, rib_max_rbufs);
898 		if (hca->send_pool == NULL) {
899 			rib_rbufpool_destroy(hca, RECV_BUFFER);
900 			goto fail3;
901 		}
902 
903 		if (hca->server_side_cache == NULL) {
904 			(void) sprintf(rssc_name,
905 			    "rib_srvr_cache_%llx",
906 			    (long long unsigned int) hca->hca_guid);
907 			hca->server_side_cache = kmem_cache_create(
908 			    rssc_name,
909 			    sizeof (cache_avl_struct_t), 0,
910 			    NULL,
911 			    NULL,
912 			    rib_server_side_cache_reclaim,
913 			    hca, NULL, 0);
914 		}
915 
916 		avl_create(&hca->avl_tree,
917 		    avl_compare,
918 		    sizeof (cache_avl_struct_t),
919 		    (uint_t)(uintptr_t)&example_avl_node.avl_link-
920 		    (uint_t)(uintptr_t)&example_avl_node);
921 
922 		rw_init(&hca->bound_services_lock, NULL, RW_DRIVER,
923 		    hca->iblock);
924 		rw_init(&hca->state_lock, NULL, RW_DRIVER, hca->iblock);
925 		rw_init(&hca->avl_rw_lock,
926 		    NULL, RW_DRIVER, hca->iblock);
927 		mutex_init(&hca->cache_allocation_lock,
928 		    NULL, MUTEX_DRIVER, NULL);
929 		hca->avl_init = TRUE;
930 
931 		/* Create kstats for the cache */
932 		ASSERT(INGLOBALZONE(curproc));
933 
934 		if (!stats_enabled) {
935 			ksp = kstat_create_zone("unix", 0, "rpcib_cache", "rpc",
936 			    KSTAT_TYPE_NAMED,
937 			    sizeof (rpcib_kstat) / sizeof (kstat_named_t),
938 			    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE,
939 			    GLOBAL_ZONEID);
940 			if (ksp) {
941 				ksp->ks_data = (void *) &rpcib_kstat;
942 				ksp->ks_update = rpcib_cache_kstat_update;
943 				kstat_install(ksp);
944 				stats_enabled = TRUE;
945 			}
946 		}
947 		if (hca->cleanup_helper == NULL) {
948 			char tq_name[sizeof (hca->hca_guid) * 2 + 1];
949 
950 			(void) snprintf(tq_name, sizeof (tq_name), "%llX",
951 			    (unsigned long long int) hca->hca_guid);
952 			hca->cleanup_helper = ddi_taskq_create(NULL,
953 			    tq_name, 1, TASKQ_DEFAULTPRI, 0);
954 		}
955 
956 		mutex_init(&hca->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
957 		cv_init(&hca->cb_cv, NULL, CV_DRIVER, NULL);
958 		rw_init(&hca->cl_conn_list.conn_lock, NULL, RW_DRIVER,
959 		    hca->iblock);
960 		rw_init(&hca->srv_conn_list.conn_lock, NULL, RW_DRIVER,
961 		    hca->iblock);
962 		mutex_init(&hca->inuse_lock, NULL, MUTEX_DRIVER, hca->iblock);
963 		hca->inuse = TRUE;
964 
965 		hca->next = ribstat->hcas_list;
966 		ribstat->hcas_list = hca;
967 		ribstat->nhca_inited++;
968 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
969 		continue;
970 
971 fail3:
972 		ibt_free_portinfo(hca->hca_ports, hca->hca_pinfosz);
973 fail2:
974 		(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
975 fail1:
976 		(void) ibt_close_hca(hca->hca_hdl);
977 		kmem_free(hca, sizeof (rib_hca_t));
978 	}
979 	rw_exit(&ribstat->hcas_list_lock);
980 	ibt_free_hca_list(hca_guids, ribstat->hca_count);
981 	rib_mod.rdma_count = rib_stat->nhca_inited;
982 
983 	/*
984 	 * return success if at least one new hca has been configured.
985 	 */
986 	if (ribstat->nhca_inited != old_nhca_inited)
987 		return (RDMA_SUCCESS);
988 	else
989 		return (RDMA_FAILED);
990 }
991 
992 /*
993  * Callback routines
994  */
995 
996 /*
997  * SCQ handlers
998  */
999 /* ARGSUSED */
1000 static void
1001 rib_clnt_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1002 {
1003 	ibt_status_t	ibt_status;
1004 	ibt_wc_t	wc;
1005 	struct send_wid	*wd;
1006 	CONN		*conn;
1007 	rib_qp_t	*qp;
1008 	int		i;
1009 
1010 	/*
1011 	 * Re-enable cq notify here to avoid missing any
1012 	 * completion queue notification.
1013 	 */
1014 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1015 
1016 	ibt_status = IBT_SUCCESS;
1017 	while (ibt_status != IBT_CQ_EMPTY) {
1018 		bzero(&wc, sizeof (wc));
1019 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1020 		if (ibt_status != IBT_SUCCESS)
1021 			return;
1022 
1023 		/*
1024 		 * Got a send completion
1025 		 */
1026 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1027 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1028 			qp = wd->qp;
1029 			conn = qptoc(qp);
1030 
1031 			mutex_enter(&wd->sendwait_lock);
1032 			switch (wc.wc_status) {
1033 			case IBT_WC_SUCCESS:
1034 				wd->status = RDMA_SUCCESS;
1035 				break;
1036 			default:
1037 /*
1038  *    RC Send Q Error Code		Local state     Remote State
1039  *    ==================== 		===========     ============
1040  *    IBT_WC_BAD_RESPONSE_ERR             ERROR           None
1041  *    IBT_WC_LOCAL_LEN_ERR                ERROR           None
1042  *    IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           None
1043  *    IBT_WC_LOCAL_PROTECT_ERR            ERROR           None
1044  *    IBT_WC_MEM_WIN_BIND_ERR             ERROR           None
1045  *    IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR
1046  *    IBT_WC_REMOTE_ACCESS_ERR            ERROR           ERROR
1047  *    IBT_WC_REMOTE_OP_ERR                ERROR           ERROR
1048  *    IBT_WC_RNR_NAK_TIMEOUT_ERR          ERROR           None
1049  *    IBT_WC_TRANS_TIMEOUT_ERR            ERROR           None
1050  *    IBT_WC_WR_FLUSHED_ERR               ERROR           None
1051  */
1052 				/*
1053 				 * Channel in error state. Set connection to
1054 				 * ERROR and cleanup will happen either from
1055 				 * conn_release  or from rib_conn_get
1056 				 */
1057 				wd->status = RDMA_FAILED;
1058 				mutex_enter(&conn->c_lock);
1059 				if (conn->c_state != C_DISCONN_PEND)
1060 					conn->c_state = C_ERROR_CONN;
1061 				mutex_exit(&conn->c_lock);
1062 				break;
1063 			}
1064 
1065 			if (wd->cv_sig == 1) {
1066 				/*
1067 				 * Notify poster
1068 				 */
1069 				cv_signal(&wd->wait_cv);
1070 				mutex_exit(&wd->sendwait_lock);
1071 			} else {
1072 				/*
1073 				 * Poster not waiting for notification.
1074 				 * Free the send buffers and send_wid
1075 				 */
1076 				for (i = 0; i < wd->nsbufs; i++) {
1077 					rib_rbuf_free(qptoc(wd->qp),
1078 					    SEND_BUFFER,
1079 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1080 				}
1081 
1082 				/* decrement the send ref count */
1083 				rib_send_rele(qp);
1084 
1085 				mutex_exit(&wd->sendwait_lock);
1086 				(void) rib_free_sendwait(wd);
1087 			}
1088 		}
1089 	}
1090 }
1091 
1092 /* ARGSUSED */
1093 static void
1094 rib_svc_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1095 {
1096 	ibt_status_t	ibt_status;
1097 	ibt_wc_t	wc;
1098 	struct send_wid	*wd;
1099 	rib_qp_t	*qp;
1100 	CONN		*conn;
1101 	int		i;
1102 
1103 	/*
1104 	 * Re-enable cq notify here to avoid missing any
1105 	 * completion queue notification.
1106 	 */
1107 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1108 
1109 	ibt_status = IBT_SUCCESS;
1110 	while (ibt_status != IBT_CQ_EMPTY) {
1111 		bzero(&wc, sizeof (wc));
1112 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1113 		if (ibt_status != IBT_SUCCESS)
1114 			return;
1115 
1116 		/*
1117 		 * Got a send completion
1118 		 */
1119 		if (wc.wc_id != RDMA_DUMMY_WRID) {
1120 			wd = (struct send_wid *)(uintptr_t)wc.wc_id;
1121 			qp = wd->qp;
1122 			conn = qptoc(qp);
1123 			mutex_enter(&wd->sendwait_lock);
1124 
1125 			switch (wc.wc_status) {
1126 			case IBT_WC_SUCCESS:
1127 				wd->status = RDMA_SUCCESS;
1128 				break;
1129 			default:
1130 				/*
1131 				 * Channel in error state. Set connection to
1132 				 * ERROR and cleanup will happen either from
1133 				 * conn_release  or conn timeout.
1134 				 */
1135 				wd->status = RDMA_FAILED;
1136 				mutex_enter(&conn->c_lock);
1137 				if (conn->c_state != C_DISCONN_PEND)
1138 					conn->c_state = C_ERROR_CONN;
1139 				mutex_exit(&conn->c_lock);
1140 				break;
1141 			}
1142 
1143 			if (wd->cv_sig == 1) {
1144 				/*
1145 				 * Update completion status and notify poster
1146 				 */
1147 				cv_signal(&wd->wait_cv);
1148 				mutex_exit(&wd->sendwait_lock);
1149 			} else {
1150 				/*
1151 				 * Poster not waiting for notification.
1152 				 * Free the send buffers and send_wid
1153 				 */
1154 				for (i = 0; i < wd->nsbufs; i++) {
1155 					rib_rbuf_free(qptoc(wd->qp),
1156 					    SEND_BUFFER,
1157 					    (void *)(uintptr_t)wd->sbufaddr[i]);
1158 				}
1159 
1160 				/* decrement the send ref count */
1161 				rib_send_rele(qp);
1162 
1163 				mutex_exit(&wd->sendwait_lock);
1164 				(void) rib_free_sendwait(wd);
1165 			}
1166 		}
1167 	}
1168 }
1169 
1170 /*
1171  * RCQ handler
1172  */
1173 /* ARGSUSED */
1174 static void
1175 rib_clnt_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1176 {
1177 	rib_qp_t	*qp;
1178 	ibt_status_t	ibt_status;
1179 	ibt_wc_t	wc;
1180 	struct recv_wid	*rwid;
1181 
1182 	/*
1183 	 * Re-enable cq notify here to avoid missing any
1184 	 * completion queue notification.
1185 	 */
1186 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1187 
1188 	ibt_status = IBT_SUCCESS;
1189 	while (ibt_status != IBT_CQ_EMPTY) {
1190 		bzero(&wc, sizeof (wc));
1191 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1192 		if (ibt_status != IBT_SUCCESS)
1193 			return;
1194 
1195 		rwid = (struct recv_wid *)(uintptr_t)wc.wc_id;
1196 		qp = rwid->qp;
1197 		if (wc.wc_status == IBT_WC_SUCCESS) {
1198 			XDR	inxdrs, *xdrs;
1199 			uint_t	xid, vers, op, find_xid = 0;
1200 			struct reply	*r;
1201 			CONN *conn = qptoc(qp);
1202 			uint32_t rdma_credit = 0;
1203 
1204 			xdrs = &inxdrs;
1205 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)rwid->addr,
1206 			    wc.wc_bytes_xfer, XDR_DECODE);
1207 			/*
1208 			 * Treat xid as opaque (xid is the first entity
1209 			 * in the rpc rdma message).
1210 			 */
1211 			xid = *(uint32_t *)(uintptr_t)rwid->addr;
1212 
1213 			/* Skip xid and set the xdr position accordingly. */
1214 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1215 			(void) xdr_u_int(xdrs, &vers);
1216 			(void) xdr_u_int(xdrs, &rdma_credit);
1217 			(void) xdr_u_int(xdrs, &op);
1218 			XDR_DESTROY(xdrs);
1219 
1220 			if (vers != RPCRDMA_VERS) {
1221 				/*
1222 				 * Invalid RPC/RDMA version. Cannot
1223 				 * interoperate.  Set connection to
1224 				 * ERROR state and bail out.
1225 				 */
1226 				mutex_enter(&conn->c_lock);
1227 				if (conn->c_state != C_DISCONN_PEND)
1228 					conn->c_state = C_ERROR_CONN;
1229 				mutex_exit(&conn->c_lock);
1230 				rib_rbuf_free(conn, RECV_BUFFER,
1231 				    (void *)(uintptr_t)rwid->addr);
1232 				rib_free_wid(rwid);
1233 				continue;
1234 			}
1235 
1236 			mutex_enter(&qp->replylist_lock);
1237 			for (r = qp->replylist; r != NULL; r = r->next) {
1238 				if (r->xid == xid) {
1239 					find_xid = 1;
1240 					switch (op) {
1241 					case RDMA_MSG:
1242 					case RDMA_NOMSG:
1243 					case RDMA_MSGP:
1244 						r->status = RDMA_SUCCESS;
1245 						r->vaddr_cq = rwid->addr;
1246 						r->bytes_xfer =
1247 						    wc.wc_bytes_xfer;
1248 						cv_signal(&r->wait_cv);
1249 						break;
1250 					default:
1251 						rib_rbuf_free(qptoc(qp),
1252 						    RECV_BUFFER,
1253 						    (void *)(uintptr_t)
1254 						    rwid->addr);
1255 						break;
1256 					}
1257 					break;
1258 				}
1259 			}
1260 			mutex_exit(&qp->replylist_lock);
1261 			if (find_xid == 0) {
1262 				/* RPC caller not waiting for reply */
1263 
1264 				DTRACE_PROBE1(rpcib__i__nomatchxid1,
1265 				    int, xid);
1266 
1267 				rib_rbuf_free(qptoc(qp), RECV_BUFFER,
1268 				    (void *)(uintptr_t)rwid->addr);
1269 			}
1270 		} else if (wc.wc_status == IBT_WC_WR_FLUSHED_ERR) {
1271 			CONN *conn = qptoc(qp);
1272 
1273 			/*
1274 			 * Connection being flushed. Just free
1275 			 * the posted buffer
1276 			 */
1277 			rib_rbuf_free(conn, RECV_BUFFER,
1278 			    (void *)(uintptr_t)rwid->addr);
1279 		} else {
1280 			CONN *conn = qptoc(qp);
1281 /*
1282  *  RC Recv Q Error Code		Local state     Remote State
1283  *  ====================		===========     ============
1284  *  IBT_WC_LOCAL_ACCESS_ERR             ERROR           ERROR when NAK recvd
1285  *  IBT_WC_LOCAL_LEN_ERR                ERROR           ERROR when NAK recvd
1286  *  IBT_WC_LOCAL_PROTECT_ERR            ERROR           ERROR when NAK recvd
1287  *  IBT_WC_LOCAL_CHAN_OP_ERR            ERROR           ERROR when NAK recvd
1288  *  IBT_WC_REMOTE_INVALID_REQ_ERR       ERROR           ERROR when NAK recvd
1289  *  IBT_WC_WR_FLUSHED_ERR               None            None
1290  */
1291 			/*
1292 			 * Channel in error state. Set connection
1293 			 * in ERROR state.
1294 			 */
1295 			mutex_enter(&conn->c_lock);
1296 			if (conn->c_state != C_DISCONN_PEND)
1297 				conn->c_state = C_ERROR_CONN;
1298 			mutex_exit(&conn->c_lock);
1299 			rib_rbuf_free(conn, RECV_BUFFER,
1300 			    (void *)(uintptr_t)rwid->addr);
1301 		}
1302 		rib_free_wid(rwid);
1303 	}
1304 }
1305 
1306 /* Server side */
1307 /* ARGSUSED */
1308 static void
1309 rib_svc_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
1310 {
1311 	rdma_recv_data_t *rdp;
1312 	rib_qp_t	*qp;
1313 	ibt_status_t	ibt_status;
1314 	ibt_wc_t	wc;
1315 	struct svc_recv	*s_recvp;
1316 	CONN		*conn;
1317 	mblk_t		*mp;
1318 
1319 	/*
1320 	 * Re-enable cq notify here to avoid missing any
1321 	 * completion queue notification.
1322 	 */
1323 	(void) ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION);
1324 
1325 	ibt_status = IBT_SUCCESS;
1326 	while (ibt_status != IBT_CQ_EMPTY) {
1327 		bzero(&wc, sizeof (wc));
1328 		ibt_status = ibt_poll_cq(cq_hdl, &wc, 1, NULL);
1329 		if (ibt_status != IBT_SUCCESS)
1330 			return;
1331 
1332 		s_recvp = (struct svc_recv *)(uintptr_t)wc.wc_id;
1333 		qp = s_recvp->qp;
1334 		conn = qptoc(qp);
1335 		mutex_enter(&qp->posted_rbufs_lock);
1336 		qp->n_posted_rbufs--;
1337 		if (qp->n_posted_rbufs == 0)
1338 			cv_signal(&qp->posted_rbufs_cv);
1339 		mutex_exit(&qp->posted_rbufs_lock);
1340 
1341 		if (wc.wc_status == IBT_WC_SUCCESS) {
1342 			XDR	inxdrs, *xdrs;
1343 			uint_t	xid, vers, op;
1344 			uint32_t rdma_credit;
1345 
1346 			xdrs = &inxdrs;
1347 			/* s_recvp->vaddr stores data */
1348 			xdrmem_create(xdrs, (caddr_t)(uintptr_t)s_recvp->vaddr,
1349 			    wc.wc_bytes_xfer, XDR_DECODE);
1350 
1351 			/*
1352 			 * Treat xid as opaque (xid is the first entity
1353 			 * in the rpc rdma message).
1354 			 */
1355 			xid = *(uint32_t *)(uintptr_t)s_recvp->vaddr;
1356 			/* Skip xid and set the xdr position accordingly. */
1357 			XDR_SETPOS(xdrs, sizeof (uint32_t));
1358 			if (!xdr_u_int(xdrs, &vers) ||
1359 			    !xdr_u_int(xdrs, &rdma_credit) ||
1360 			    !xdr_u_int(xdrs, &op)) {
1361 				rib_rbuf_free(conn, RECV_BUFFER,
1362 				    (void *)(uintptr_t)s_recvp->vaddr);
1363 				XDR_DESTROY(xdrs);
1364 				(void) rib_free_svc_recv(s_recvp);
1365 				continue;
1366 			}
1367 			XDR_DESTROY(xdrs);
1368 
1369 			if (vers != RPCRDMA_VERS) {
1370 				/*
1371 				 * Invalid RPC/RDMA version.
1372 				 * Drop rpc rdma message.
1373 				 */
1374 				rib_rbuf_free(conn, RECV_BUFFER,
1375 				    (void *)(uintptr_t)s_recvp->vaddr);
1376 				(void) rib_free_svc_recv(s_recvp);
1377 				continue;
1378 			}
1379 			/*
1380 			 * Is this for RDMA_DONE?
1381 			 */
1382 			if (op == RDMA_DONE) {
1383 				rib_rbuf_free(conn, RECV_BUFFER,
1384 				    (void *)(uintptr_t)s_recvp->vaddr);
1385 				/*
1386 				 * Wake up the thread waiting on
1387 				 * a RDMA_DONE for xid
1388 				 */
1389 				mutex_enter(&qp->rdlist_lock);
1390 				rdma_done_notify(qp, xid);
1391 				mutex_exit(&qp->rdlist_lock);
1392 				(void) rib_free_svc_recv(s_recvp);
1393 				continue;
1394 			}
1395 
1396 			mutex_enter(&plugin_state_lock);
1397 			if (plugin_state == ACCEPT) {
1398 				while ((mp = allocb(sizeof (*rdp), BPRI_LO))
1399 				    == NULL)
1400 					(void) strwaitbuf(
1401 					    sizeof (*rdp), BPRI_LO);
1402 				/*
1403 				 * Plugin is in accept state, hence the master
1404 				 * transport queue for this is still accepting
1405 				 * requests. Hence we can call svc_queuereq to
1406 				 * queue this recieved msg.
1407 				 */
1408 				rdp = (rdma_recv_data_t *)mp->b_rptr;
1409 				rdp->conn = conn;
1410 				rdp->rpcmsg.addr =
1411 				    (caddr_t)(uintptr_t)s_recvp->vaddr;
1412 				rdp->rpcmsg.type = RECV_BUFFER;
1413 				rdp->rpcmsg.len = wc.wc_bytes_xfer;
1414 				rdp->status = wc.wc_status;
1415 				mutex_enter(&conn->c_lock);
1416 				conn->c_ref++;
1417 				mutex_exit(&conn->c_lock);
1418 				mp->b_wptr += sizeof (*rdp);
1419 				svc_queuereq((queue_t *)rib_stat->q, mp);
1420 				mutex_exit(&plugin_state_lock);
1421 			} else {
1422 				/*
1423 				 * The master transport for this is going
1424 				 * away and the queue is not accepting anymore
1425 				 * requests for krpc, so don't do anything, just
1426 				 * free the msg.
1427 				 */
1428 				mutex_exit(&plugin_state_lock);
1429 				rib_rbuf_free(conn, RECV_BUFFER,
1430 				    (void *)(uintptr_t)s_recvp->vaddr);
1431 			}
1432 		} else {
1433 			rib_rbuf_free(conn, RECV_BUFFER,
1434 			    (void *)(uintptr_t)s_recvp->vaddr);
1435 		}
1436 		(void) rib_free_svc_recv(s_recvp);
1437 	}
1438 }
1439 
1440 static void
1441 rib_attach_hca()
1442 {
1443 	mutex_enter(&rib_stat->open_hca_lock);
1444 	(void) rpcib_open_hcas(rib_stat);
1445 	rib_listen(NULL);
1446 	mutex_exit(&rib_stat->open_hca_lock);
1447 }
1448 
1449 /*
1450  * Handles DR event of IBT_HCA_DETACH_EVENT.
1451  */
1452 /* ARGSUSED */
1453 static void
1454 rib_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
1455 	ibt_async_code_t code, ibt_async_event_t *event)
1456 {
1457 	switch (code) {
1458 	case IBT_HCA_ATTACH_EVENT:
1459 		rib_attach_hca();
1460 		break;
1461 	case IBT_HCA_DETACH_EVENT:
1462 	{
1463 		rib_hca_t *hca;
1464 
1465 		rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1466 		for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1467 			rw_enter(&hca->state_lock, RW_READER);
1468 			if ((hca->state != HCA_DETACHED) &&
1469 			    (hca->hca_hdl == hca_hdl)) {
1470 				rw_exit(&hca->state_lock);
1471 				break;
1472 			}
1473 			rw_exit(&hca->state_lock);
1474 		}
1475 		rw_exit(&rib_stat->hcas_list_lock);
1476 
1477 		if (hca == NULL)
1478 			return;
1479 		ASSERT(hca->hca_hdl == hca_hdl);
1480 		rib_detach_hca(hca);
1481 #ifdef DEBUG
1482 		cmn_err(CE_NOTE, "rib_async_handler(): HCA being detached!\n");
1483 #endif
1484 		break;
1485 	}
1486 	case IBT_EVENT_PORT_UP:
1487 		/*
1488 		 * A port is up. We should call rib_listen() since there is
1489 		 * a chance that rib_listen() may have failed during
1490 		 * rib_attach_hca() because the port had not been up yet.
1491 		 */
1492 		rib_listen(NULL);
1493 #ifdef DEBUG
1494 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_PORT_UP\n");
1495 #endif
1496 		break;
1497 #ifdef DEBUG
1498 	case IBT_EVENT_PATH_MIGRATED:
1499 		cmn_err(CE_NOTE, "rib_async_handler(): "
1500 		    "IBT_EVENT_PATH_MIGRATED\n");
1501 		break;
1502 	case IBT_EVENT_SQD:
1503 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_SQD\n");
1504 		break;
1505 	case IBT_EVENT_COM_EST:
1506 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_EVENT_COM_EST\n");
1507 		break;
1508 	case IBT_ERROR_CATASTROPHIC_CHAN:
1509 		cmn_err(CE_NOTE, "rib_async_handler(): "
1510 		    "IBT_ERROR_CATASTROPHIC_CHAN\n");
1511 		break;
1512 	case IBT_ERROR_INVALID_REQUEST_CHAN:
1513 		cmn_err(CE_NOTE, "rib_async_handler(): "
1514 		    "IBT_ERROR_INVALID_REQUEST_CHAN\n");
1515 		break;
1516 	case IBT_ERROR_ACCESS_VIOLATION_CHAN:
1517 		cmn_err(CE_NOTE, "rib_async_handler(): "
1518 		    "IBT_ERROR_ACCESS_VIOLATION_CHAN\n");
1519 		break;
1520 	case IBT_ERROR_PATH_MIGRATE_REQ:
1521 		cmn_err(CE_NOTE, "rib_async_handler(): "
1522 		    "IBT_ERROR_PATH_MIGRATE_REQ\n");
1523 		break;
1524 	case IBT_ERROR_CQ:
1525 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_CQ\n");
1526 		break;
1527 	case IBT_ERROR_PORT_DOWN:
1528 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ERROR_PORT_DOWN\n");
1529 		break;
1530 	case IBT_ASYNC_OPAQUE1:
1531 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE1\n");
1532 		break;
1533 	case IBT_ASYNC_OPAQUE2:
1534 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE2\n");
1535 		break;
1536 	case IBT_ASYNC_OPAQUE3:
1537 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE3\n");
1538 		break;
1539 	case IBT_ASYNC_OPAQUE4:
1540 		cmn_err(CE_NOTE, "rib_async_handler(): IBT_ASYNC_OPAQUE4\n");
1541 		break;
1542 #endif
1543 	default:
1544 		break;
1545 	}
1546 }
1547 
1548 /*
1549  * Client's reachable function.
1550  */
1551 static rdma_stat
1552 rib_reachable(int addr_type, struct netbuf *raddr, void **handle)
1553 {
1554 	rdma_stat	status;
1555 	rpcib_ping_t	rpt;
1556 	struct netbuf	saddr;
1557 	CONN		*conn;
1558 
1559 	bzero(&saddr, sizeof (struct netbuf));
1560 	status = rib_connect(&saddr, raddr, addr_type, &rpt, &conn);
1561 
1562 	if (status == RDMA_SUCCESS) {
1563 		*handle = (void *)rpt.hca;
1564 		/* release the reference */
1565 		(void) rib_conn_release(conn);
1566 		return (RDMA_SUCCESS);
1567 	} else {
1568 		*handle = NULL;
1569 		DTRACE_PROBE(rpcib__i__pingfailed);
1570 		return (RDMA_FAILED);
1571 	}
1572 }
1573 
1574 /* Client side qp creation */
1575 static rdma_stat
1576 rib_clnt_create_chan(rib_hca_t *hca, struct netbuf *raddr, rib_qp_t **qp)
1577 {
1578 	rib_qp_t	*kqp = NULL;
1579 	CONN		*conn;
1580 	rdma_clnt_cred_ctrl_t *cc_info;
1581 
1582 	ASSERT(qp != NULL);
1583 	*qp = NULL;
1584 
1585 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1586 	conn = qptoc(kqp);
1587 	kqp->hca = hca;
1588 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1589 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1590 
1591 	kqp->mode = RIB_CLIENT;
1592 	kqp->chan_flags = IBT_BLOCKING;
1593 	conn->c_raddr.buf = kmem_alloc(raddr->len, KM_SLEEP);
1594 	bcopy(raddr->buf, conn->c_raddr.buf, raddr->len);
1595 	conn->c_raddr.len = conn->c_raddr.maxlen = raddr->len;
1596 	/*
1597 	 * Initialize
1598 	 */
1599 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1600 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1601 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1602 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1603 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1604 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DRIVER, hca->iblock);
1605 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1606 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1607 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1608 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1609 	/*
1610 	 * Initialize the client credit control
1611 	 * portion of the rdmaconn struct.
1612 	 */
1613 	kqp->rdmaconn.c_cc_type = RDMA_CC_CLNT;
1614 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
1615 	cc_info->clnt_cc_granted_ops = 0;
1616 	cc_info->clnt_cc_in_flight_ops = 0;
1617 	cv_init(&cc_info->clnt_cc_cv, NULL, CV_DEFAULT, NULL);
1618 
1619 	*qp = kqp;
1620 	return (RDMA_SUCCESS);
1621 }
1622 
1623 /* Server side qp creation */
1624 static rdma_stat
1625 rib_svc_create_chan(rib_hca_t *hca, caddr_t q, uint8_t port, rib_qp_t **qp)
1626 {
1627 	rib_qp_t	*kqp = NULL;
1628 	ibt_chan_sizes_t	chan_sizes;
1629 	ibt_rc_chan_alloc_args_t	qp_attr;
1630 	ibt_status_t		ibt_status;
1631 	rdma_srv_cred_ctrl_t *cc_info;
1632 
1633 	*qp = NULL;
1634 
1635 	kqp = kmem_zalloc(sizeof (rib_qp_t), KM_SLEEP);
1636 	kqp->hca = hca;
1637 	kqp->port_num = port;
1638 	kqp->rdmaconn.c_rdmamod = &rib_mod;
1639 	kqp->rdmaconn.c_private = (caddr_t)kqp;
1640 
1641 	/*
1642 	 * Create the qp handle
1643 	 */
1644 	bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1645 	qp_attr.rc_scq = hca->svc_scq->rib_cq_hdl;
1646 	qp_attr.rc_rcq = hca->svc_rcq->rib_cq_hdl;
1647 	qp_attr.rc_pd = hca->pd_hdl;
1648 	qp_attr.rc_hca_port_num = port;
1649 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1650 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1651 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1652 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1653 	qp_attr.rc_clone_chan = NULL;
1654 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1655 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1656 
1657 	rw_enter(&hca->state_lock, RW_READER);
1658 	if (hca->state != HCA_DETACHED) {
1659 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1660 		    IBT_ACHAN_NO_FLAGS, &qp_attr, &kqp->qp_hdl,
1661 		    &chan_sizes);
1662 	} else {
1663 		rw_exit(&hca->state_lock);
1664 		goto fail;
1665 	}
1666 	rw_exit(&hca->state_lock);
1667 
1668 	if (ibt_status != IBT_SUCCESS) {
1669 		DTRACE_PROBE1(rpcib__i_svccreatechanfail,
1670 		    int, ibt_status);
1671 		goto fail;
1672 	}
1673 
1674 	kqp->mode = RIB_SERVER;
1675 	kqp->chan_flags = IBT_BLOCKING;
1676 	kqp->q = q;	/* server ONLY */
1677 
1678 	cv_init(&kqp->cb_conn_cv, NULL, CV_DEFAULT, NULL);
1679 	cv_init(&kqp->posted_rbufs_cv, NULL, CV_DEFAULT, NULL);
1680 	mutex_init(&kqp->replylist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1681 	mutex_init(&kqp->posted_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1682 	cv_init(&kqp->send_rbufs_cv, NULL, CV_DEFAULT, NULL);
1683 	mutex_init(&kqp->send_rbufs_lock, NULL, MUTEX_DRIVER, hca->iblock);
1684 	mutex_init(&kqp->rdlist_lock, NULL, MUTEX_DEFAULT, hca->iblock);
1685 	mutex_init(&kqp->cb_lock, NULL, MUTEX_DRIVER, hca->iblock);
1686 	cv_init(&kqp->rdmaconn.c_cv, NULL, CV_DEFAULT, NULL);
1687 	mutex_init(&kqp->rdmaconn.c_lock, NULL, MUTEX_DRIVER, hca->iblock);
1688 	/*
1689 	 * Set the private data area to qp to be used in callbacks
1690 	 */
1691 	ibt_set_chan_private(kqp->qp_hdl, (void *)kqp);
1692 	kqp->rdmaconn.c_state = C_CONNECTED;
1693 
1694 	/*
1695 	 * Initialize the server credit control
1696 	 * portion of the rdmaconn struct.
1697 	 */
1698 	kqp->rdmaconn.c_cc_type = RDMA_CC_SRV;
1699 	cc_info = &kqp->rdmaconn.rdma_conn_cred_ctrl_u.c_srv_cc;
1700 	cc_info->srv_cc_buffers_granted = preposted_rbufs;
1701 	cc_info->srv_cc_cur_buffers_used = 0;
1702 	cc_info->srv_cc_posted = preposted_rbufs;
1703 
1704 	*qp = kqp;
1705 
1706 	return (RDMA_SUCCESS);
1707 fail:
1708 	if (kqp)
1709 		kmem_free(kqp, sizeof (rib_qp_t));
1710 
1711 	return (RDMA_FAILED);
1712 }
1713 
1714 /* ARGSUSED */
1715 ibt_cm_status_t
1716 rib_clnt_cm_handler(void *clnt_hdl, ibt_cm_event_t *event,
1717     ibt_cm_return_args_t *ret_args, void *priv_data,
1718     ibt_priv_data_len_t len)
1719 {
1720 	rib_hca_t	*hca;
1721 
1722 	hca = (rib_hca_t *)clnt_hdl;
1723 
1724 	switch (event->cm_type) {
1725 
1726 	/* got a connection close event */
1727 	case IBT_CM_EVENT_CONN_CLOSED:
1728 	{
1729 		CONN	*conn;
1730 		rib_qp_t *qp;
1731 
1732 		/* check reason why connection was closed */
1733 		switch (event->cm_event.closed) {
1734 		case IBT_CM_CLOSED_DREP_RCVD:
1735 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
1736 		case IBT_CM_CLOSED_DUP:
1737 		case IBT_CM_CLOSED_ABORT:
1738 		case IBT_CM_CLOSED_ALREADY:
1739 			/*
1740 			 * These cases indicate the local end initiated
1741 			 * the closing of the channel. Nothing to do here.
1742 			 */
1743 			break;
1744 		default:
1745 			/*
1746 			 * Reason for CONN_CLOSED event must be one of
1747 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
1748 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
1749 			 * the remote end is closing the channel. In these
1750 			 * cases free the channel and transition to error
1751 			 * state
1752 			 */
1753 			qp = ibt_get_chan_private(event->cm_channel);
1754 			conn = qptoc(qp);
1755 			mutex_enter(&conn->c_lock);
1756 			if (conn->c_state == C_DISCONN_PEND) {
1757 				mutex_exit(&conn->c_lock);
1758 				break;
1759 			}
1760 
1761 			conn->c_state = C_ERROR_CONN;
1762 
1763 			/*
1764 			 * Free the conn if c_ref is down to 0 already
1765 			 */
1766 			if (conn->c_ref == 0) {
1767 				/*
1768 				 * Remove from list and free conn
1769 				 */
1770 				conn->c_state = C_DISCONN_PEND;
1771 				mutex_exit(&conn->c_lock);
1772 				rw_enter(&hca->state_lock, RW_READER);
1773 				if (hca->state != HCA_DETACHED)
1774 					(void) rib_disconnect_channel(conn,
1775 					    &hca->cl_conn_list);
1776 				rw_exit(&hca->state_lock);
1777 			} else {
1778 				/*
1779 				 * conn will be freed when c_ref goes to 0.
1780 				 * Indicate to cleaning thread not to close
1781 				 * the connection, but just free the channel.
1782 				 */
1783 				conn->c_flags |= C_CLOSE_NOTNEEDED;
1784 				mutex_exit(&conn->c_lock);
1785 			}
1786 #ifdef DEBUG
1787 			if (rib_debug)
1788 				cmn_err(CE_NOTE, "rib_clnt_cm_handler: "
1789 				    "(CONN_CLOSED) channel disconnected");
1790 #endif
1791 			break;
1792 		}
1793 		break;
1794 	}
1795 	default:
1796 		break;
1797 	}
1798 	return (IBT_CM_ACCEPT);
1799 }
1800 
1801 /*
1802  * Connect to the server.
1803  */
1804 rdma_stat
1805 rib_conn_to_srv(rib_hca_t *hca, rib_qp_t *qp, rpcib_ping_t *rptp)
1806 {
1807 	ibt_chan_open_args_t	chan_args;	/* channel args */
1808 	ibt_chan_sizes_t	chan_sizes;
1809 	ibt_rc_chan_alloc_args_t	qp_attr;
1810 	ibt_status_t		ibt_status;
1811 	ibt_rc_returns_t	ret_args;   	/* conn reject info */
1812 	int refresh = REFRESH_ATTEMPTS;	/* refresh if IBT_CM_CONN_STALE */
1813 	ibt_ip_cm_info_t	ipcm_info;
1814 	uint8_t cmp_ip_pvt[IBT_IP_HDR_PRIV_DATA_SZ];
1815 
1816 
1817 	(void) bzero(&chan_args, sizeof (chan_args));
1818 	(void) bzero(&qp_attr, sizeof (ibt_rc_chan_alloc_args_t));
1819 	(void) bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
1820 
1821 	ipcm_info.src_addr.family = rptp->srcip.family;
1822 	switch (ipcm_info.src_addr.family) {
1823 	case AF_INET:
1824 		ipcm_info.src_addr.un.ip4addr = rptp->srcip.un.ip4addr;
1825 		break;
1826 	case AF_INET6:
1827 		ipcm_info.src_addr.un.ip6addr = rptp->srcip.un.ip6addr;
1828 		break;
1829 	}
1830 
1831 	ipcm_info.dst_addr.family = rptp->srcip.family;
1832 	switch (ipcm_info.dst_addr.family) {
1833 	case AF_INET:
1834 		ipcm_info.dst_addr.un.ip4addr = rptp->dstip.un.ip4addr;
1835 		break;
1836 	case AF_INET6:
1837 		ipcm_info.dst_addr.un.ip6addr = rptp->dstip.un.ip6addr;
1838 		break;
1839 	}
1840 
1841 	ipcm_info.src_port = (in_port_t)nfs_rdma_port;
1842 
1843 	ibt_status = ibt_format_ip_private_data(&ipcm_info,
1844 	    IBT_IP_HDR_PRIV_DATA_SZ, cmp_ip_pvt);
1845 
1846 	if (ibt_status != IBT_SUCCESS) {
1847 		cmn_err(CE_WARN, "ibt_format_ip_private_data failed\n");
1848 		return (-1);
1849 	}
1850 
1851 	qp_attr.rc_hca_port_num = rptp->path.pi_prim_cep_path.cep_hca_port_num;
1852 	/* Alloc a RC channel */
1853 	qp_attr.rc_scq = hca->clnt_scq->rib_cq_hdl;
1854 	qp_attr.rc_rcq = hca->clnt_rcq->rib_cq_hdl;
1855 	qp_attr.rc_pd = hca->pd_hdl;
1856 	qp_attr.rc_sizes.cs_sq_sgl = DSEG_MAX;
1857 	qp_attr.rc_sizes.cs_rq_sgl = RQ_DSEG_MAX;
1858 	qp_attr.rc_sizes.cs_sq = DEF_SQ_SIZE;
1859 	qp_attr.rc_sizes.cs_rq = DEF_RQ_SIZE;
1860 	qp_attr.rc_clone_chan = NULL;
1861 	qp_attr.rc_control = IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1862 	qp_attr.rc_flags = IBT_WR_SIGNALED;
1863 
1864 	rptp->path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, nfs_rdma_port);
1865 	chan_args.oc_path = &rptp->path;
1866 
1867 	chan_args.oc_cm_handler = rib_clnt_cm_handler;
1868 	chan_args.oc_cm_clnt_private = (void *)hca;
1869 	chan_args.oc_rdma_ra_out = 4;
1870 	chan_args.oc_rdma_ra_in = 4;
1871 	chan_args.oc_path_retry_cnt = 2;
1872 	chan_args.oc_path_rnr_retry_cnt = RNR_RETRIES;
1873 	chan_args.oc_priv_data = cmp_ip_pvt;
1874 	chan_args.oc_priv_data_len = IBT_IP_HDR_PRIV_DATA_SZ;
1875 
1876 refresh:
1877 	rw_enter(&hca->state_lock, RW_READER);
1878 	if (hca->state != HCA_DETACHED) {
1879 		ibt_status = ibt_alloc_rc_channel(hca->hca_hdl,
1880 		    IBT_ACHAN_NO_FLAGS,
1881 		    &qp_attr, &qp->qp_hdl,
1882 		    &chan_sizes);
1883 	} else {
1884 		rw_exit(&hca->state_lock);
1885 		return (RDMA_FAILED);
1886 	}
1887 	rw_exit(&hca->state_lock);
1888 
1889 	if (ibt_status != IBT_SUCCESS) {
1890 		DTRACE_PROBE1(rpcib__i_conntosrv,
1891 		    int, ibt_status);
1892 		return (RDMA_FAILED);
1893 	}
1894 
1895 	/* Connect to the Server */
1896 	(void) bzero(&ret_args, sizeof (ret_args));
1897 	mutex_enter(&qp->cb_lock);
1898 	ibt_status = ibt_open_rc_channel(qp->qp_hdl, IBT_OCHAN_NO_FLAGS,
1899 	    IBT_BLOCKING, &chan_args, &ret_args);
1900 	if (ibt_status != IBT_SUCCESS) {
1901 		DTRACE_PROBE2(rpcib__i_openrctosrv,
1902 		    int, ibt_status, int, ret_args.rc_status);
1903 
1904 		(void) ibt_free_channel(qp->qp_hdl);
1905 		qp->qp_hdl = NULL;
1906 		mutex_exit(&qp->cb_lock);
1907 		if (refresh-- && ibt_status == IBT_CM_FAILURE &&
1908 		    ret_args.rc_status == IBT_CM_CONN_STALE) {
1909 			/*
1910 			 * Got IBT_CM_CONN_STALE probably because of stale
1911 			 * data on the passive end of a channel that existed
1912 			 * prior to reboot. Retry establishing a channel
1913 			 * REFRESH_ATTEMPTS times, during which time the
1914 			 * stale conditions on the server might clear up.
1915 			 */
1916 			goto refresh;
1917 		}
1918 		return (RDMA_FAILED);
1919 	}
1920 	mutex_exit(&qp->cb_lock);
1921 	/*
1922 	 * Set the private data area to qp to be used in callbacks
1923 	 */
1924 	ibt_set_chan_private(qp->qp_hdl, (void *)qp);
1925 	return (RDMA_SUCCESS);
1926 }
1927 
1928 rdma_stat
1929 rib_ping_srv(int addr_type, struct netbuf *raddr, rpcib_ping_t *rptp)
1930 {
1931 	uint_t			i, addr_count;
1932 	ibt_status_t		ibt_status;
1933 	uint8_t			num_paths_p;
1934 	ibt_ip_path_attr_t	ipattr;
1935 	ibt_path_ip_src_t	srcip;
1936 	rpcib_ipaddrs_t		addrs4;
1937 	rpcib_ipaddrs_t		addrs6;
1938 	struct sockaddr_in	*sinp;
1939 	struct sockaddr_in6	*sin6p;
1940 	rdma_stat		retval = RDMA_FAILED;
1941 	rib_hca_t *hca;
1942 
1943 	if ((addr_type != AF_INET) && (addr_type != AF_INET6))
1944 		return (RDMA_INVAL);
1945 	ASSERT(raddr->buf != NULL);
1946 
1947 	bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
1948 
1949 	if (!rpcib_get_ib_addresses(&addrs4, &addrs6) ||
1950 	    (addrs4.ri_count == 0 && addrs6.ri_count == 0)) {
1951 		retval = RDMA_FAILED;
1952 		goto done2;
1953 	}
1954 
1955 	if (addr_type == AF_INET) {
1956 		addr_count = addrs4.ri_count;
1957 		sinp = (struct sockaddr_in *)raddr->buf;
1958 		rptp->dstip.family = AF_INET;
1959 		rptp->dstip.un.ip4addr = sinp->sin_addr.s_addr;
1960 		sinp = addrs4.ri_list;
1961 	} else {
1962 		addr_count = addrs6.ri_count;
1963 		sin6p = (struct sockaddr_in6 *)raddr->buf;
1964 		rptp->dstip.family = AF_INET6;
1965 		rptp->dstip.un.ip6addr = sin6p->sin6_addr;
1966 		sin6p = addrs6.ri_list;
1967 	}
1968 
1969 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
1970 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
1971 		rw_enter(&hca->state_lock, RW_READER);
1972 		if (hca->state == HCA_DETACHED) {
1973 			rw_exit(&hca->state_lock);
1974 			continue;
1975 		}
1976 
1977 		ipattr.ipa_dst_ip 	= &rptp->dstip;
1978 		ipattr.ipa_hca_guid	= hca->hca_guid;
1979 		ipattr.ipa_ndst		= 1;
1980 		ipattr.ipa_max_paths	= 1;
1981 		ipattr.ipa_src_ip.family = rptp->dstip.family;
1982 		for (i = 0; i < addr_count; i++) {
1983 			num_paths_p = 0;
1984 			if (addr_type == AF_INET) {
1985 				ipattr.ipa_src_ip.un.ip4addr =
1986 				    sinp[i].sin_addr.s_addr;
1987 			} else {
1988 				ipattr.ipa_src_ip.un.ip6addr =
1989 				    sin6p[i].sin6_addr;
1990 			}
1991 			bzero(&srcip, sizeof (ibt_path_ip_src_t));
1992 
1993 			ibt_status = ibt_get_ip_paths(rib_stat->ibt_clnt_hdl,
1994 			    IBT_PATH_NO_FLAGS, &ipattr, &rptp->path,
1995 			    &num_paths_p, &srcip);
1996 			if (ibt_status == IBT_SUCCESS &&
1997 			    num_paths_p != 0 &&
1998 			    rptp->path.pi_hca_guid == hca->hca_guid) {
1999 				rptp->hca = hca;
2000 				rw_exit(&hca->state_lock);
2001 				if (addr_type == AF_INET) {
2002 					rptp->srcip.family = AF_INET;
2003 					rptp->srcip.un.ip4addr =
2004 					    srcip.ip_primary.un.ip4addr;
2005 				} else {
2006 					rptp->srcip.family = AF_INET6;
2007 					rptp->srcip.un.ip6addr =
2008 					    srcip.ip_primary.un.ip6addr;
2009 
2010 				}
2011 				retval = RDMA_SUCCESS;
2012 				goto done1;
2013 			}
2014 		}
2015 		rw_exit(&hca->state_lock);
2016 	}
2017 done1:
2018 	rw_exit(&rib_stat->hcas_list_lock);
2019 done2:
2020 	if (addrs4.ri_size > 0)
2021 		kmem_free(addrs4.ri_list, addrs4.ri_size);
2022 	if (addrs6.ri_size > 0)
2023 		kmem_free(addrs6.ri_list, addrs6.ri_size);
2024 	return (retval);
2025 }
2026 
2027 /*
2028  * Close channel, remove from connection list and
2029  * free up resources allocated for that channel.
2030  */
2031 rdma_stat
2032 rib_disconnect_channel(CONN *conn, rib_conn_list_t *conn_list)
2033 {
2034 	rib_qp_t	*qp = ctoqp(conn);
2035 	rib_hca_t	*hca;
2036 
2037 	mutex_enter(&conn->c_lock);
2038 	if (conn->c_timeout != NULL) {
2039 		mutex_exit(&conn->c_lock);
2040 		(void) untimeout(conn->c_timeout);
2041 		mutex_enter(&conn->c_lock);
2042 	}
2043 
2044 	while (conn->c_flags & C_CLOSE_PENDING) {
2045 		cv_wait(&conn->c_cv, &conn->c_lock);
2046 	}
2047 	mutex_exit(&conn->c_lock);
2048 
2049 	/*
2050 	 * c_ref == 0 and connection is in C_DISCONN_PEND
2051 	 */
2052 	hca = qp->hca;
2053 	if (conn_list != NULL)
2054 		(void) rib_rm_conn(conn, conn_list);
2055 
2056 	/*
2057 	 * There is only one case where we get here with
2058 	 * qp_hdl = NULL, which is during connection setup on
2059 	 * the client. In such a case there are no posted
2060 	 * send/recv buffers.
2061 	 */
2062 	if (qp->qp_hdl != NULL) {
2063 		mutex_enter(&qp->posted_rbufs_lock);
2064 		while (qp->n_posted_rbufs)
2065 			cv_wait(&qp->posted_rbufs_cv, &qp->posted_rbufs_lock);
2066 		mutex_exit(&qp->posted_rbufs_lock);
2067 
2068 		mutex_enter(&qp->send_rbufs_lock);
2069 		while (qp->n_send_rbufs)
2070 			cv_wait(&qp->send_rbufs_cv, &qp->send_rbufs_lock);
2071 		mutex_exit(&qp->send_rbufs_lock);
2072 
2073 		(void) ibt_free_channel(qp->qp_hdl);
2074 		qp->qp_hdl = NULL;
2075 	}
2076 
2077 	ASSERT(qp->rdlist == NULL);
2078 
2079 	if (qp->replylist != NULL) {
2080 		(void) rib_rem_replylist(qp);
2081 	}
2082 
2083 	cv_destroy(&qp->cb_conn_cv);
2084 	cv_destroy(&qp->posted_rbufs_cv);
2085 	cv_destroy(&qp->send_rbufs_cv);
2086 	mutex_destroy(&qp->cb_lock);
2087 	mutex_destroy(&qp->replylist_lock);
2088 	mutex_destroy(&qp->posted_rbufs_lock);
2089 	mutex_destroy(&qp->send_rbufs_lock);
2090 	mutex_destroy(&qp->rdlist_lock);
2091 
2092 	cv_destroy(&conn->c_cv);
2093 	mutex_destroy(&conn->c_lock);
2094 
2095 	if (conn->c_raddr.buf != NULL) {
2096 		kmem_free(conn->c_raddr.buf, conn->c_raddr.len);
2097 	}
2098 	if (conn->c_laddr.buf != NULL) {
2099 		kmem_free(conn->c_laddr.buf, conn->c_laddr.len);
2100 	}
2101 	if (conn->c_netid != NULL) {
2102 		kmem_free(conn->c_netid, (strlen(conn->c_netid) + 1));
2103 	}
2104 
2105 	/*
2106 	 * Credit control cleanup.
2107 	 */
2108 	if (qp->rdmaconn.c_cc_type == RDMA_CC_CLNT) {
2109 		rdma_clnt_cred_ctrl_t *cc_info;
2110 		cc_info = &qp->rdmaconn.rdma_conn_cred_ctrl_u.c_clnt_cc;
2111 		cv_destroy(&cc_info->clnt_cc_cv);
2112 	}
2113 
2114 	kmem_free(qp, sizeof (rib_qp_t));
2115 
2116 	/*
2117 	 * If HCA has been DETACHED and the srv/clnt_conn_list is NULL,
2118 	 * then the hca is no longer being used.
2119 	 */
2120 	if (conn_list != NULL) {
2121 		rw_enter(&hca->state_lock, RW_READER);
2122 		if (hca->state == HCA_DETACHED) {
2123 			rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
2124 			if (hca->srv_conn_list.conn_hd == NULL) {
2125 				rw_enter(&hca->cl_conn_list.conn_lock,
2126 				    RW_READER);
2127 
2128 				if (hca->cl_conn_list.conn_hd == NULL) {
2129 					mutex_enter(&hca->inuse_lock);
2130 					hca->inuse = FALSE;
2131 					cv_signal(&hca->cb_cv);
2132 					mutex_exit(&hca->inuse_lock);
2133 				}
2134 				rw_exit(&hca->cl_conn_list.conn_lock);
2135 			}
2136 			rw_exit(&hca->srv_conn_list.conn_lock);
2137 		}
2138 		rw_exit(&hca->state_lock);
2139 	}
2140 
2141 	return (RDMA_SUCCESS);
2142 }
2143 
2144 /*
2145  * All sends are done under the protection of
2146  * the wdesc->sendwait_lock. n_send_rbufs count
2147  * is protected using the send_rbufs_lock.
2148  * lock ordering is:
2149  * sendwait_lock -> send_rbufs_lock
2150  */
2151 
2152 void
2153 rib_send_hold(rib_qp_t *qp)
2154 {
2155 	mutex_enter(&qp->send_rbufs_lock);
2156 	qp->n_send_rbufs++;
2157 	mutex_exit(&qp->send_rbufs_lock);
2158 }
2159 
2160 void
2161 rib_send_rele(rib_qp_t *qp)
2162 {
2163 	mutex_enter(&qp->send_rbufs_lock);
2164 	qp->n_send_rbufs--;
2165 	if (qp->n_send_rbufs == 0)
2166 		cv_signal(&qp->send_rbufs_cv);
2167 	mutex_exit(&qp->send_rbufs_lock);
2168 }
2169 
2170 /*
2171  * Wait for send completion notification. Only on receiving a
2172  * notification be it a successful or error completion, free the
2173  * send_wid.
2174  */
2175 static rdma_stat
2176 rib_sendwait(rib_qp_t *qp, struct send_wid *wd)
2177 {
2178 	clock_t timout, cv_wait_ret;
2179 	rdma_stat error = RDMA_SUCCESS;
2180 	int	i;
2181 
2182 	/*
2183 	 * Wait for send to complete
2184 	 */
2185 	ASSERT(wd != NULL);
2186 	mutex_enter(&wd->sendwait_lock);
2187 	if (wd->status == (uint_t)SEND_WAIT) {
2188 		timout = drv_usectohz(SEND_WAIT_TIME * 1000000) +
2189 		    ddi_get_lbolt();
2190 
2191 		if (qp->mode == RIB_SERVER) {
2192 			while ((cv_wait_ret = cv_timedwait(&wd->wait_cv,
2193 			    &wd->sendwait_lock, timout)) > 0 &&
2194 			    wd->status == (uint_t)SEND_WAIT)
2195 				;
2196 			switch (cv_wait_ret) {
2197 			case -1:	/* timeout */
2198 				DTRACE_PROBE(rpcib__i__srvsendwait__timeout);
2199 
2200 				wd->cv_sig = 0;		/* no signal needed */
2201 				error = RDMA_TIMEDOUT;
2202 				break;
2203 			default:	/* got send completion */
2204 				break;
2205 			}
2206 		} else {
2207 			while ((cv_wait_ret = cv_timedwait_sig(&wd->wait_cv,
2208 			    &wd->sendwait_lock, timout)) > 0 &&
2209 			    wd->status == (uint_t)SEND_WAIT)
2210 				;
2211 			switch (cv_wait_ret) {
2212 			case -1:	/* timeout */
2213 				DTRACE_PROBE(rpcib__i__clntsendwait__timeout);
2214 
2215 				wd->cv_sig = 0;		/* no signal needed */
2216 				error = RDMA_TIMEDOUT;
2217 				break;
2218 			case 0:		/* interrupted */
2219 				DTRACE_PROBE(rpcib__i__clntsendwait__intr);
2220 
2221 				wd->cv_sig = 0;		/* no signal needed */
2222 				error = RDMA_INTR;
2223 				break;
2224 			default:	/* got send completion */
2225 				break;
2226 			}
2227 		}
2228 	}
2229 
2230 	if (wd->status != (uint_t)SEND_WAIT) {
2231 		/* got send completion */
2232 		if (wd->status != RDMA_SUCCESS) {
2233 			switch (wd->status) {
2234 			case RDMA_CONNLOST:
2235 				error = RDMA_CONNLOST;
2236 				break;
2237 			default:
2238 				error = RDMA_FAILED;
2239 				break;
2240 			}
2241 		}
2242 		for (i = 0; i < wd->nsbufs; i++) {
2243 			rib_rbuf_free(qptoc(qp), SEND_BUFFER,
2244 			    (void *)(uintptr_t)wd->sbufaddr[i]);
2245 		}
2246 
2247 		rib_send_rele(qp);
2248 
2249 		mutex_exit(&wd->sendwait_lock);
2250 		(void) rib_free_sendwait(wd);
2251 
2252 	} else {
2253 		mutex_exit(&wd->sendwait_lock);
2254 	}
2255 	return (error);
2256 }
2257 
2258 static struct send_wid *
2259 rib_init_sendwait(uint32_t xid, int cv_sig, rib_qp_t *qp)
2260 {
2261 	struct send_wid	*wd;
2262 
2263 	wd = kmem_zalloc(sizeof (struct send_wid), KM_SLEEP);
2264 	wd->xid = xid;
2265 	wd->cv_sig = cv_sig;
2266 	wd->qp = qp;
2267 	cv_init(&wd->wait_cv, NULL, CV_DEFAULT, NULL);
2268 	mutex_init(&wd->sendwait_lock, NULL, MUTEX_DRIVER, NULL);
2269 	wd->status = (uint_t)SEND_WAIT;
2270 
2271 	return (wd);
2272 }
2273 
2274 static int
2275 rib_free_sendwait(struct send_wid *wdesc)
2276 {
2277 	cv_destroy(&wdesc->wait_cv);
2278 	mutex_destroy(&wdesc->sendwait_lock);
2279 	kmem_free(wdesc, sizeof (*wdesc));
2280 
2281 	return (0);
2282 }
2283 
2284 static rdma_stat
2285 rib_rem_rep(rib_qp_t *qp, struct reply *rep)
2286 {
2287 	mutex_enter(&qp->replylist_lock);
2288 	if (rep != NULL) {
2289 		(void) rib_remreply(qp, rep);
2290 		mutex_exit(&qp->replylist_lock);
2291 		return (RDMA_SUCCESS);
2292 	}
2293 	mutex_exit(&qp->replylist_lock);
2294 	return (RDMA_FAILED);
2295 }
2296 
2297 /*
2298  * Send buffers are freed here only in case of error in posting
2299  * on QP. If the post succeeded, the send buffers are freed upon
2300  * send completion in rib_sendwait() or in the scq_handler.
2301  */
2302 rdma_stat
2303 rib_send_and_wait(CONN *conn, struct clist *cl, uint32_t msgid,
2304 	int send_sig, int cv_sig, caddr_t *swid)
2305 {
2306 	struct send_wid	*wdesc;
2307 	struct clist	*clp;
2308 	ibt_status_t	ibt_status = IBT_SUCCESS;
2309 	rdma_stat	ret = RDMA_SUCCESS;
2310 	ibt_send_wr_t	tx_wr;
2311 	int		i, nds;
2312 	ibt_wr_ds_t	sgl[DSEG_MAX];
2313 	uint_t		total_msg_size;
2314 	rib_qp_t	*qp;
2315 
2316 	qp = ctoqp(conn);
2317 
2318 	ASSERT(cl != NULL);
2319 
2320 	bzero(&tx_wr, sizeof (ibt_send_wr_t));
2321 
2322 	nds = 0;
2323 	total_msg_size = 0;
2324 	clp = cl;
2325 	while (clp != NULL) {
2326 		if (nds >= DSEG_MAX) {
2327 			DTRACE_PROBE(rpcib__i__sendandwait_dsegmax_exceeded);
2328 			return (RDMA_FAILED);
2329 		}
2330 		sgl[nds].ds_va = clp->w.c_saddr;
2331 		sgl[nds].ds_key = clp->c_smemhandle.mrc_lmr; /* lkey */
2332 		sgl[nds].ds_len = clp->c_len;
2333 		total_msg_size += clp->c_len;
2334 		clp = clp->c_next;
2335 		nds++;
2336 	}
2337 
2338 	if (send_sig) {
2339 		/* Set SEND_SIGNAL flag. */
2340 		tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2341 		wdesc = rib_init_sendwait(msgid, cv_sig, qp);
2342 		*swid = (caddr_t)wdesc;
2343 		tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2344 		mutex_enter(&wdesc->sendwait_lock);
2345 		wdesc->nsbufs = nds;
2346 		for (i = 0; i < nds; i++) {
2347 			wdesc->sbufaddr[i] = sgl[i].ds_va;
2348 		}
2349 	} else {
2350 		tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2351 		*swid = NULL;
2352 		tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2353 	}
2354 
2355 	tx_wr.wr_opcode = IBT_WRC_SEND;
2356 	tx_wr.wr_trans = IBT_RC_SRV;
2357 	tx_wr.wr_nds = nds;
2358 	tx_wr.wr_sgl = sgl;
2359 
2360 	mutex_enter(&conn->c_lock);
2361 	if (conn->c_state == C_CONNECTED) {
2362 		ibt_status = ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2363 	}
2364 	if (conn->c_state != C_CONNECTED ||
2365 	    ibt_status != IBT_SUCCESS) {
2366 		if (conn->c_state != C_DISCONN_PEND)
2367 			conn->c_state = C_ERROR_CONN;
2368 		mutex_exit(&conn->c_lock);
2369 		if (send_sig) {
2370 			for (i = 0; i < nds; i++) {
2371 				rib_rbuf_free(conn, SEND_BUFFER,
2372 				    (void *)(uintptr_t)wdesc->sbufaddr[i]);
2373 			}
2374 			mutex_exit(&wdesc->sendwait_lock);
2375 			(void) rib_free_sendwait(wdesc);
2376 		}
2377 		return (RDMA_CONNLOST);
2378 	}
2379 
2380 	mutex_exit(&conn->c_lock);
2381 
2382 	if (send_sig) {
2383 		rib_send_hold(qp);
2384 		mutex_exit(&wdesc->sendwait_lock);
2385 		if (cv_sig) {
2386 			/*
2387 			 * cv_wait for send to complete.
2388 			 * We can fail due to a timeout or signal or
2389 			 * unsuccessful send.
2390 			 */
2391 			ret = rib_sendwait(qp, wdesc);
2392 
2393 			return (ret);
2394 		}
2395 	}
2396 
2397 	return (RDMA_SUCCESS);
2398 }
2399 
2400 
2401 rdma_stat
2402 rib_send(CONN *conn, struct clist *cl, uint32_t msgid)
2403 {
2404 	rdma_stat	ret;
2405 	caddr_t		wd;
2406 
2407 	/* send-wait & cv_signal */
2408 	ret = rib_send_and_wait(conn, cl, msgid, 1, 1, &wd);
2409 	return (ret);
2410 }
2411 
2412 /*
2413  * Deprecated/obsolete interface not used currently
2414  * but earlier used for READ-READ protocol.
2415  * Send RPC reply and wait for RDMA_DONE.
2416  */
2417 rdma_stat
2418 rib_send_resp(CONN *conn, struct clist *cl, uint32_t msgid)
2419 {
2420 	rdma_stat ret = RDMA_SUCCESS;
2421 	struct rdma_done_list *rd;
2422 	clock_t timout, cv_wait_ret;
2423 	caddr_t *wid = NULL;
2424 	rib_qp_t *qp = ctoqp(conn);
2425 
2426 	mutex_enter(&qp->rdlist_lock);
2427 	rd = rdma_done_add(qp, msgid);
2428 
2429 	/* No cv_signal (whether send-wait or no-send-wait) */
2430 	ret = rib_send_and_wait(conn, cl, msgid, 1, 0, wid);
2431 
2432 	if (ret != RDMA_SUCCESS) {
2433 		rdma_done_rm(qp, rd);
2434 	} else {
2435 		/*
2436 		 * Wait for RDMA_DONE from remote end
2437 		 */
2438 		timout =
2439 		    drv_usectohz(REPLY_WAIT_TIME * 1000000) + ddi_get_lbolt();
2440 		cv_wait_ret = cv_timedwait(&rd->rdma_done_cv,
2441 		    &qp->rdlist_lock,
2442 		    timout);
2443 
2444 		rdma_done_rm(qp, rd);
2445 
2446 		if (cv_wait_ret < 0) {
2447 			ret = RDMA_TIMEDOUT;
2448 		}
2449 	}
2450 
2451 	mutex_exit(&qp->rdlist_lock);
2452 	return (ret);
2453 }
2454 
2455 static struct recv_wid *
2456 rib_create_wid(rib_qp_t *qp, ibt_wr_ds_t *sgl, uint32_t msgid)
2457 {
2458 	struct recv_wid	*rwid;
2459 
2460 	rwid = kmem_zalloc(sizeof (struct recv_wid), KM_SLEEP);
2461 	rwid->xid = msgid;
2462 	rwid->addr = sgl->ds_va;
2463 	rwid->qp = qp;
2464 
2465 	return (rwid);
2466 }
2467 
2468 static void
2469 rib_free_wid(struct recv_wid *rwid)
2470 {
2471 	kmem_free(rwid, sizeof (struct recv_wid));
2472 }
2473 
2474 rdma_stat
2475 rib_clnt_post(CONN* conn, struct clist *cl, uint32_t msgid)
2476 {
2477 	rib_qp_t	*qp = ctoqp(conn);
2478 	struct clist	*clp = cl;
2479 	struct reply	*rep;
2480 	struct recv_wid	*rwid;
2481 	int		nds;
2482 	ibt_wr_ds_t	sgl[DSEG_MAX];
2483 	ibt_recv_wr_t	recv_wr;
2484 	rdma_stat	ret;
2485 	ibt_status_t	ibt_status;
2486 
2487 	/*
2488 	 * rdma_clnt_postrecv uses RECV_BUFFER.
2489 	 */
2490 
2491 	nds = 0;
2492 	while (cl != NULL) {
2493 		if (nds >= DSEG_MAX) {
2494 			ret = RDMA_FAILED;
2495 			goto done;
2496 		}
2497 		sgl[nds].ds_va = cl->w.c_saddr;
2498 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2499 		sgl[nds].ds_len = cl->c_len;
2500 		cl = cl->c_next;
2501 		nds++;
2502 	}
2503 
2504 	if (nds != 1) {
2505 		ret = RDMA_FAILED;
2506 		goto done;
2507 	}
2508 
2509 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2510 	recv_wr.wr_nds = nds;
2511 	recv_wr.wr_sgl = sgl;
2512 
2513 	rwid = rib_create_wid(qp, &sgl[0], msgid);
2514 	if (rwid) {
2515 		recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)rwid;
2516 	} else {
2517 		ret = RDMA_NORESOURCE;
2518 		goto done;
2519 	}
2520 	rep = rib_addreplylist(qp, msgid);
2521 	if (!rep) {
2522 		rib_free_wid(rwid);
2523 		ret = RDMA_NORESOURCE;
2524 		goto done;
2525 	}
2526 
2527 	mutex_enter(&conn->c_lock);
2528 
2529 	if (conn->c_state == C_CONNECTED) {
2530 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2531 	}
2532 
2533 	if (conn->c_state != C_CONNECTED ||
2534 	    ibt_status != IBT_SUCCESS) {
2535 		if (conn->c_state != C_DISCONN_PEND)
2536 			conn->c_state = C_ERROR_CONN;
2537 		mutex_exit(&conn->c_lock);
2538 		rib_free_wid(rwid);
2539 		(void) rib_rem_rep(qp, rep);
2540 		ret = RDMA_CONNLOST;
2541 		goto done;
2542 	}
2543 	mutex_exit(&conn->c_lock);
2544 	return (RDMA_SUCCESS);
2545 
2546 done:
2547 	while (clp != NULL) {
2548 		rib_rbuf_free(conn, RECV_BUFFER,
2549 		    (void *)(uintptr_t)clp->w.c_saddr3);
2550 		clp = clp->c_next;
2551 	}
2552 	return (ret);
2553 }
2554 
2555 rdma_stat
2556 rib_svc_post(CONN* conn, struct clist *cl)
2557 {
2558 	rib_qp_t	*qp = ctoqp(conn);
2559 	struct svc_recv	*s_recvp;
2560 	int		nds;
2561 	ibt_wr_ds_t	sgl[DSEG_MAX];
2562 	ibt_recv_wr_t	recv_wr;
2563 	ibt_status_t	ibt_status;
2564 
2565 	nds = 0;
2566 	while (cl != NULL) {
2567 		if (nds >= DSEG_MAX) {
2568 			return (RDMA_FAILED);
2569 		}
2570 		sgl[nds].ds_va = cl->w.c_saddr;
2571 		sgl[nds].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2572 		sgl[nds].ds_len = cl->c_len;
2573 		cl = cl->c_next;
2574 		nds++;
2575 	}
2576 
2577 	if (nds != 1) {
2578 		rib_rbuf_free(conn, RECV_BUFFER,
2579 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2580 
2581 		return (RDMA_FAILED);
2582 	}
2583 
2584 	bzero(&recv_wr, sizeof (ibt_recv_wr_t));
2585 	recv_wr.wr_nds = nds;
2586 	recv_wr.wr_sgl = sgl;
2587 
2588 	s_recvp = rib_init_svc_recv(qp, &sgl[0]);
2589 	/* Use s_recvp's addr as wr id */
2590 	recv_wr.wr_id = (ibt_wrid_t)(uintptr_t)s_recvp;
2591 	mutex_enter(&conn->c_lock);
2592 	if (conn->c_state == C_CONNECTED) {
2593 		ibt_status = ibt_post_recv(qp->qp_hdl, &recv_wr, 1, NULL);
2594 	}
2595 	if (conn->c_state != C_CONNECTED ||
2596 	    ibt_status != IBT_SUCCESS) {
2597 		if (conn->c_state != C_DISCONN_PEND)
2598 			conn->c_state = C_ERROR_CONN;
2599 		mutex_exit(&conn->c_lock);
2600 		rib_rbuf_free(conn, RECV_BUFFER,
2601 		    (caddr_t)(uintptr_t)sgl[0].ds_va);
2602 		(void) rib_free_svc_recv(s_recvp);
2603 
2604 		return (RDMA_CONNLOST);
2605 	}
2606 	mutex_exit(&conn->c_lock);
2607 
2608 	return (RDMA_SUCCESS);
2609 }
2610 
2611 /* Client */
2612 rdma_stat
2613 rib_post_resp(CONN* conn, struct clist *cl, uint32_t msgid)
2614 {
2615 
2616 	return (rib_clnt_post(conn, cl, msgid));
2617 }
2618 
2619 /* Client */
2620 rdma_stat
2621 rib_post_resp_remove(CONN* conn, uint32_t msgid)
2622 {
2623 	rib_qp_t	*qp = ctoqp(conn);
2624 	struct reply	*rep;
2625 
2626 	mutex_enter(&qp->replylist_lock);
2627 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2628 		if (rep->xid == msgid) {
2629 			if (rep->vaddr_cq) {
2630 				rib_rbuf_free(conn, RECV_BUFFER,
2631 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2632 			}
2633 			(void) rib_remreply(qp, rep);
2634 			break;
2635 		}
2636 	}
2637 	mutex_exit(&qp->replylist_lock);
2638 
2639 	return (RDMA_SUCCESS);
2640 }
2641 
2642 /* Server */
2643 rdma_stat
2644 rib_post_recv(CONN *conn, struct clist *cl)
2645 {
2646 	rib_qp_t	*qp = ctoqp(conn);
2647 
2648 	if (rib_svc_post(conn, cl) == RDMA_SUCCESS) {
2649 		mutex_enter(&qp->posted_rbufs_lock);
2650 		qp->n_posted_rbufs++;
2651 		mutex_exit(&qp->posted_rbufs_lock);
2652 		return (RDMA_SUCCESS);
2653 	}
2654 	return (RDMA_FAILED);
2655 }
2656 
2657 /*
2658  * Client side only interface to "recv" the rpc reply buf
2659  * posted earlier by rib_post_resp(conn, cl, msgid).
2660  */
2661 rdma_stat
2662 rib_recv(CONN *conn, struct clist **clp, uint32_t msgid)
2663 {
2664 	struct reply *rep = NULL;
2665 	clock_t timout, cv_wait_ret;
2666 	rdma_stat ret = RDMA_SUCCESS;
2667 	rib_qp_t *qp = ctoqp(conn);
2668 
2669 	/*
2670 	 * Find the reply structure for this msgid
2671 	 */
2672 	mutex_enter(&qp->replylist_lock);
2673 
2674 	for (rep = qp->replylist; rep != NULL; rep = rep->next) {
2675 		if (rep->xid == msgid)
2676 			break;
2677 	}
2678 
2679 	if (rep != NULL) {
2680 		/*
2681 		 * If message not yet received, wait.
2682 		 */
2683 		if (rep->status == (uint_t)REPLY_WAIT) {
2684 			timout = ddi_get_lbolt() +
2685 			    drv_usectohz(REPLY_WAIT_TIME * 1000000);
2686 
2687 			while ((cv_wait_ret = cv_timedwait_sig(&rep->wait_cv,
2688 			    &qp->replylist_lock, timout)) > 0 &&
2689 			    rep->status == (uint_t)REPLY_WAIT)
2690 				;
2691 
2692 			switch (cv_wait_ret) {
2693 			case -1:	/* timeout */
2694 				ret = RDMA_TIMEDOUT;
2695 				break;
2696 			case 0:
2697 				ret = RDMA_INTR;
2698 				break;
2699 			default:
2700 				break;
2701 			}
2702 		}
2703 
2704 		if (rep->status == RDMA_SUCCESS) {
2705 			struct clist *cl = NULL;
2706 
2707 			/*
2708 			 * Got message successfully
2709 			 */
2710 			clist_add(&cl, 0, rep->bytes_xfer, NULL,
2711 			    (caddr_t)(uintptr_t)rep->vaddr_cq, NULL, NULL);
2712 			*clp = cl;
2713 		} else {
2714 			if (rep->status != (uint_t)REPLY_WAIT) {
2715 				/*
2716 				 * Got error in reply message. Free
2717 				 * recv buffer here.
2718 				 */
2719 				ret = rep->status;
2720 				rib_rbuf_free(conn, RECV_BUFFER,
2721 				    (caddr_t)(uintptr_t)rep->vaddr_cq);
2722 			}
2723 		}
2724 		(void) rib_remreply(qp, rep);
2725 	} else {
2726 		/*
2727 		 * No matching reply structure found for given msgid on the
2728 		 * reply wait list.
2729 		 */
2730 		ret = RDMA_INVAL;
2731 		DTRACE_PROBE(rpcib__i__nomatchxid2);
2732 	}
2733 
2734 	/*
2735 	 * Done.
2736 	 */
2737 	mutex_exit(&qp->replylist_lock);
2738 	return (ret);
2739 }
2740 
2741 /*
2742  * RDMA write a buffer to the remote address.
2743  */
2744 rdma_stat
2745 rib_write(CONN *conn, struct clist *cl, int wait)
2746 {
2747 	ibt_send_wr_t	tx_wr;
2748 	int		cv_sig;
2749 	ibt_wr_ds_t	sgl[DSEG_MAX];
2750 	struct send_wid	*wdesc;
2751 	ibt_status_t	ibt_status;
2752 	rdma_stat	ret = RDMA_SUCCESS;
2753 	rib_qp_t	*qp = ctoqp(conn);
2754 	uint64_t	n_writes = 0;
2755 
2756 	if (cl == NULL) {
2757 		return (RDMA_FAILED);
2758 	}
2759 
2760 	while ((cl != NULL)) {
2761 		if (cl->c_len > 0) {
2762 			bzero(&tx_wr, sizeof (ibt_send_wr_t));
2763 			tx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->u.c_daddr;
2764 			tx_wr.wr.rc.rcwr.rdma.rdma_rkey =
2765 			    cl->c_dmemhandle.mrc_rmr; /* rkey */
2766 			sgl[0].ds_va = cl->w.c_saddr;
2767 			sgl[0].ds_key = cl->c_smemhandle.mrc_lmr; /* lkey */
2768 			sgl[0].ds_len = cl->c_len;
2769 
2770 			if (wait) {
2771 				cv_sig = 1;
2772 			} else {
2773 				if (n_writes > max_unsignaled_rws) {
2774 					n_writes = 0;
2775 					cv_sig = 1;
2776 				} else {
2777 					cv_sig = 0;
2778 				}
2779 			}
2780 
2781 			if (cv_sig) {
2782 				tx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2783 				wdesc = rib_init_sendwait(0, cv_sig, qp);
2784 				tx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2785 				mutex_enter(&wdesc->sendwait_lock);
2786 			} else {
2787 				tx_wr.wr_flags = IBT_WR_NO_FLAGS;
2788 				tx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2789 			}
2790 			tx_wr.wr_opcode = IBT_WRC_RDMAW;
2791 			tx_wr.wr_trans = IBT_RC_SRV;
2792 			tx_wr.wr_nds = 1;
2793 			tx_wr.wr_sgl = sgl;
2794 
2795 			mutex_enter(&conn->c_lock);
2796 			if (conn->c_state == C_CONNECTED) {
2797 				ibt_status =
2798 				    ibt_post_send(qp->qp_hdl, &tx_wr, 1, NULL);
2799 			}
2800 			if (conn->c_state != C_CONNECTED ||
2801 			    ibt_status != IBT_SUCCESS) {
2802 				if (conn->c_state != C_DISCONN_PEND)
2803 					conn->c_state = C_ERROR_CONN;
2804 				mutex_exit(&conn->c_lock);
2805 				if (cv_sig) {
2806 					mutex_exit(&wdesc->sendwait_lock);
2807 					(void) rib_free_sendwait(wdesc);
2808 				}
2809 				return (RDMA_CONNLOST);
2810 			}
2811 
2812 			mutex_exit(&conn->c_lock);
2813 
2814 			/*
2815 			 * Wait for send to complete
2816 			 */
2817 			if (cv_sig) {
2818 
2819 				rib_send_hold(qp);
2820 				mutex_exit(&wdesc->sendwait_lock);
2821 
2822 				ret = rib_sendwait(qp, wdesc);
2823 				if (ret != 0)
2824 					return (ret);
2825 			}
2826 			n_writes ++;
2827 		}
2828 		cl = cl->c_next;
2829 	}
2830 	return (RDMA_SUCCESS);
2831 }
2832 
2833 /*
2834  * RDMA Read a buffer from the remote address.
2835  */
2836 rdma_stat
2837 rib_read(CONN *conn, struct clist *cl, int wait)
2838 {
2839 	ibt_send_wr_t	rx_wr;
2840 	int		cv_sig = 0;
2841 	ibt_wr_ds_t	sgl;
2842 	struct send_wid	*wdesc;
2843 	ibt_status_t	ibt_status = IBT_SUCCESS;
2844 	rdma_stat	ret = RDMA_SUCCESS;
2845 	rib_qp_t	*qp = ctoqp(conn);
2846 
2847 	if (cl == NULL) {
2848 		return (RDMA_FAILED);
2849 	}
2850 
2851 	while (cl != NULL) {
2852 		bzero(&rx_wr, sizeof (ibt_send_wr_t));
2853 		/*
2854 		 * Remote address is at the head chunk item in list.
2855 		 */
2856 		rx_wr.wr.rc.rcwr.rdma.rdma_raddr = cl->w.c_saddr;
2857 		rx_wr.wr.rc.rcwr.rdma.rdma_rkey = cl->c_smemhandle.mrc_rmr;
2858 
2859 		sgl.ds_va = cl->u.c_daddr;
2860 		sgl.ds_key = cl->c_dmemhandle.mrc_lmr; /* lkey */
2861 		sgl.ds_len = cl->c_len;
2862 
2863 		/*
2864 		 * If there are multiple chunks to be read, and
2865 		 * wait is set, ask for signal only for the last chunk
2866 		 * and wait only on the last chunk. The completion of
2867 		 * RDMA_READ on last chunk ensures that reads on all
2868 		 * previous chunks are also completed.
2869 		 */
2870 		if (wait && (cl->c_next == NULL)) {
2871 			cv_sig = 1;
2872 			wdesc = rib_init_sendwait(0, cv_sig, qp);
2873 			rx_wr.wr_flags = IBT_WR_SEND_SIGNAL;
2874 			rx_wr.wr_id = (ibt_wrid_t)(uintptr_t)wdesc;
2875 			mutex_enter(&wdesc->sendwait_lock);
2876 		} else {
2877 			rx_wr.wr_flags = IBT_WR_NO_FLAGS;
2878 			rx_wr.wr_id = (ibt_wrid_t)RDMA_DUMMY_WRID;
2879 		}
2880 		rx_wr.wr_opcode = IBT_WRC_RDMAR;
2881 		rx_wr.wr_trans = IBT_RC_SRV;
2882 		rx_wr.wr_nds = 1;
2883 		rx_wr.wr_sgl = &sgl;
2884 
2885 		mutex_enter(&conn->c_lock);
2886 		if (conn->c_state == C_CONNECTED) {
2887 			ibt_status = ibt_post_send(qp->qp_hdl, &rx_wr, 1, NULL);
2888 		}
2889 		if (conn->c_state != C_CONNECTED ||
2890 		    ibt_status != IBT_SUCCESS) {
2891 			if (conn->c_state != C_DISCONN_PEND)
2892 				conn->c_state = C_ERROR_CONN;
2893 			mutex_exit(&conn->c_lock);
2894 			if (wait && (cl->c_next == NULL)) {
2895 				mutex_exit(&wdesc->sendwait_lock);
2896 				(void) rib_free_sendwait(wdesc);
2897 			}
2898 			return (RDMA_CONNLOST);
2899 		}
2900 
2901 		mutex_exit(&conn->c_lock);
2902 
2903 		/*
2904 		 * Wait for send to complete if this is the
2905 		 * last item in the list.
2906 		 */
2907 		if (wait && cl->c_next == NULL) {
2908 			rib_send_hold(qp);
2909 			mutex_exit(&wdesc->sendwait_lock);
2910 
2911 			ret = rib_sendwait(qp, wdesc);
2912 
2913 			if (ret != 0)
2914 				return (ret);
2915 		}
2916 		cl = cl->c_next;
2917 	}
2918 	return (RDMA_SUCCESS);
2919 }
2920 
2921 /*
2922  * rib_srv_cm_handler()
2923  *    Connection Manager callback to handle RC connection requests.
2924  */
2925 /* ARGSUSED */
2926 static ibt_cm_status_t
2927 rib_srv_cm_handler(void *any, ibt_cm_event_t *event,
2928 	ibt_cm_return_args_t *ret_args, void *priv_data,
2929 	ibt_priv_data_len_t len)
2930 {
2931 	queue_t		*q;
2932 	rib_qp_t	*qp;
2933 	rib_hca_t	*hca;
2934 	rdma_stat	status = RDMA_SUCCESS;
2935 	int		i;
2936 	struct clist	cl;
2937 	rdma_buf_t	rdbuf = {0};
2938 	void		*buf = NULL;
2939 	CONN		*conn;
2940 	ibt_ip_cm_info_t	ipinfo;
2941 	struct sockaddr_in *s;
2942 	struct sockaddr_in6 *s6;
2943 	int sin_size = sizeof (struct sockaddr_in);
2944 	int in_size = sizeof (struct in_addr);
2945 	int sin6_size = sizeof (struct sockaddr_in6);
2946 
2947 	ASSERT(any != NULL);
2948 	ASSERT(event != NULL);
2949 
2950 	hca = (rib_hca_t *)any;
2951 
2952 	/* got a connection request */
2953 	switch (event->cm_type) {
2954 	case IBT_CM_EVENT_REQ_RCV:
2955 		/*
2956 		 * If the plugin is in the NO_ACCEPT state, bail out.
2957 		 */
2958 		mutex_enter(&plugin_state_lock);
2959 		if (plugin_state == NO_ACCEPT) {
2960 			mutex_exit(&plugin_state_lock);
2961 			return (IBT_CM_REJECT);
2962 		}
2963 		mutex_exit(&plugin_state_lock);
2964 
2965 		/*
2966 		 * Need to send a MRA MAD to CM so that it does not
2967 		 * timeout on us.
2968 		 */
2969 		(void) ibt_cm_delay(IBT_CM_DELAY_REQ, event->cm_session_id,
2970 		    event->cm_event.req.req_timeout * 8, NULL, 0);
2971 
2972 		mutex_enter(&rib_stat->open_hca_lock);
2973 		q = rib_stat->q;
2974 		mutex_exit(&rib_stat->open_hca_lock);
2975 
2976 		status = rib_svc_create_chan(hca, (caddr_t)q,
2977 		    event->cm_event.req.req_prim_hca_port, &qp);
2978 
2979 		if (status) {
2980 			return (IBT_CM_REJECT);
2981 		}
2982 
2983 		ret_args->cm_ret.rep.cm_channel = qp->qp_hdl;
2984 		ret_args->cm_ret.rep.cm_rdma_ra_out = 4;
2985 		ret_args->cm_ret.rep.cm_rdma_ra_in = 4;
2986 		ret_args->cm_ret.rep.cm_rnr_retry_cnt = RNR_RETRIES;
2987 
2988 		/*
2989 		 * Pre-posts RECV buffers
2990 		 */
2991 		conn = qptoc(qp);
2992 		for (i = 0; i < preposted_rbufs; i++) {
2993 			bzero(&rdbuf, sizeof (rdbuf));
2994 			rdbuf.type = RECV_BUFFER;
2995 			buf = rib_rbuf_alloc(conn, &rdbuf);
2996 			if (buf == NULL) {
2997 				/*
2998 				 * A connection is not established yet.
2999 				 * Just flush the channel. Buffers
3000 				 * posted till now will error out with
3001 				 * IBT_WC_WR_FLUSHED_ERR.
3002 				 */
3003 				(void) ibt_flush_channel(qp->qp_hdl);
3004 				(void) rib_disconnect_channel(conn, NULL);
3005 				return (IBT_CM_REJECT);
3006 			}
3007 
3008 			bzero(&cl, sizeof (cl));
3009 			cl.w.c_saddr3 = (caddr_t)rdbuf.addr;
3010 			cl.c_len = rdbuf.len;
3011 			cl.c_smemhandle.mrc_lmr =
3012 			    rdbuf.handle.mrc_lmr; /* lkey */
3013 			cl.c_next = NULL;
3014 			status = rib_post_recv(conn, &cl);
3015 			if (status != RDMA_SUCCESS) {
3016 				/*
3017 				 * A connection is not established yet.
3018 				 * Just flush the channel. Buffers
3019 				 * posted till now will error out with
3020 				 * IBT_WC_WR_FLUSHED_ERR.
3021 				 */
3022 				(void) ibt_flush_channel(qp->qp_hdl);
3023 				(void) rib_disconnect_channel(conn, NULL);
3024 				return (IBT_CM_REJECT);
3025 			}
3026 		}
3027 		(void) rib_add_connlist(conn, &hca->srv_conn_list);
3028 
3029 		/*
3030 		 * Get the address translation
3031 		 */
3032 		rw_enter(&hca->state_lock, RW_READER);
3033 		if (hca->state == HCA_DETACHED) {
3034 			rw_exit(&hca->state_lock);
3035 			return (IBT_CM_REJECT);
3036 		}
3037 		rw_exit(&hca->state_lock);
3038 
3039 		bzero(&ipinfo, sizeof (ibt_ip_cm_info_t));
3040 
3041 		if (ibt_get_ip_data(event->cm_priv_data_len,
3042 		    event->cm_priv_data,
3043 		    &ipinfo) != IBT_SUCCESS) {
3044 
3045 			return (IBT_CM_REJECT);
3046 		}
3047 
3048 		switch (ipinfo.src_addr.family) {
3049 		case AF_INET:
3050 
3051 			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1,
3052 			    KM_SLEEP);
3053 			(void) strcpy(conn->c_netid, RIBNETID_TCP);
3054 
3055 			conn->c_raddr.maxlen =
3056 			    conn->c_raddr.len = sin_size;
3057 			conn->c_raddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3058 
3059 			s = (struct sockaddr_in *)conn->c_raddr.buf;
3060 			s->sin_family = AF_INET;
3061 			bcopy((void *)&ipinfo.src_addr.un.ip4addr,
3062 			    &s->sin_addr, in_size);
3063 
3064 			conn->c_laddr.maxlen =
3065 			    conn->c_laddr.len = sin_size;
3066 			conn->c_laddr.buf = kmem_zalloc(sin_size, KM_SLEEP);
3067 
3068 			s = (struct sockaddr_in *)conn->c_laddr.buf;
3069 			s->sin_family = AF_INET;
3070 			bcopy((void *)&ipinfo.dst_addr.un.ip4addr,
3071 			    &s->sin_addr, in_size);
3072 
3073 			break;
3074 
3075 		case AF_INET6:
3076 
3077 			conn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1,
3078 			    KM_SLEEP);
3079 			(void) strcpy(conn->c_netid, RIBNETID_TCP6);
3080 
3081 			conn->c_raddr.maxlen =
3082 			    conn->c_raddr.len = sin6_size;
3083 			conn->c_raddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3084 
3085 			s6 = (struct sockaddr_in6 *)conn->c_raddr.buf;
3086 			s6->sin6_family = AF_INET6;
3087 			bcopy((void *)&ipinfo.src_addr.un.ip6addr,
3088 			    &s6->sin6_addr,
3089 			    sizeof (struct in6_addr));
3090 
3091 			conn->c_laddr.maxlen =
3092 			    conn->c_laddr.len = sin6_size;
3093 			conn->c_laddr.buf = kmem_zalloc(sin6_size, KM_SLEEP);
3094 
3095 			s6 = (struct sockaddr_in6 *)conn->c_laddr.buf;
3096 			s6->sin6_family = AF_INET6;
3097 			bcopy((void *)&ipinfo.dst_addr.un.ip6addr,
3098 			    &s6->sin6_addr,
3099 			    sizeof (struct in6_addr));
3100 
3101 			break;
3102 
3103 		default:
3104 			return (IBT_CM_REJECT);
3105 		}
3106 
3107 		break;
3108 
3109 	case IBT_CM_EVENT_CONN_CLOSED:
3110 	{
3111 		CONN		*conn;
3112 		rib_qp_t	*qp;
3113 
3114 		switch (event->cm_event.closed) {
3115 		case IBT_CM_CLOSED_DREP_RCVD:
3116 		case IBT_CM_CLOSED_DREQ_TIMEOUT:
3117 		case IBT_CM_CLOSED_DUP:
3118 		case IBT_CM_CLOSED_ABORT:
3119 		case IBT_CM_CLOSED_ALREADY:
3120 			/*
3121 			 * These cases indicate the local end initiated
3122 			 * the closing of the channel. Nothing to do here.
3123 			 */
3124 			break;
3125 		default:
3126 			/*
3127 			 * Reason for CONN_CLOSED event must be one of
3128 			 * IBT_CM_CLOSED_DREQ_RCVD or IBT_CM_CLOSED_REJ_RCVD
3129 			 * or IBT_CM_CLOSED_STALE. These indicate cases were
3130 			 * the remote end is closing the channel. In these
3131 			 * cases free the channel and transition to error
3132 			 * state
3133 			 */
3134 			qp = ibt_get_chan_private(event->cm_channel);
3135 			conn = qptoc(qp);
3136 			mutex_enter(&conn->c_lock);
3137 			if (conn->c_state == C_DISCONN_PEND) {
3138 				mutex_exit(&conn->c_lock);
3139 				break;
3140 			}
3141 			conn->c_state = C_ERROR_CONN;
3142 
3143 			/*
3144 			 * Free the conn if c_ref goes down to 0
3145 			 */
3146 			if (conn->c_ref == 0) {
3147 				/*
3148 				 * Remove from list and free conn
3149 				 */
3150 				conn->c_state = C_DISCONN_PEND;
3151 				mutex_exit(&conn->c_lock);
3152 				(void) rib_disconnect_channel(conn,
3153 				    &hca->srv_conn_list);
3154 			} else {
3155 				/*
3156 				 * conn will be freed when c_ref goes to 0.
3157 				 * Indicate to cleaning thread not to close
3158 				 * the connection, but just free the channel.
3159 				 */
3160 				conn->c_flags |= C_CLOSE_NOTNEEDED;
3161 				mutex_exit(&conn->c_lock);
3162 			}
3163 			DTRACE_PROBE(rpcib__i__srvcm_chandisconnect);
3164 			break;
3165 		}
3166 		break;
3167 	}
3168 	case IBT_CM_EVENT_CONN_EST:
3169 		/*
3170 		 * RTU received, hence connection established.
3171 		 */
3172 		if (rib_debug > 1)
3173 			cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3174 			    "(CONN_EST) channel established");
3175 		break;
3176 
3177 	default:
3178 		if (rib_debug > 2) {
3179 			/* Let CM handle the following events. */
3180 			if (event->cm_type == IBT_CM_EVENT_REP_RCV) {
3181 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3182 				    "server recv'ed IBT_CM_EVENT_REP_RCV\n");
3183 			} else if (event->cm_type == IBT_CM_EVENT_LAP_RCV) {
3184 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3185 				    "server recv'ed IBT_CM_EVENT_LAP_RCV\n");
3186 			} else if (event->cm_type == IBT_CM_EVENT_MRA_RCV) {
3187 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3188 				    "server recv'ed IBT_CM_EVENT_MRA_RCV\n");
3189 			} else if (event->cm_type == IBT_CM_EVENT_APR_RCV) {
3190 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3191 				    "server recv'ed IBT_CM_EVENT_APR_RCV\n");
3192 			} else if (event->cm_type == IBT_CM_EVENT_FAILURE) {
3193 				cmn_err(CE_NOTE, "rib_srv_cm_handler: "
3194 				    "server recv'ed IBT_CM_EVENT_FAILURE\n");
3195 			}
3196 		}
3197 		return (IBT_CM_DEFAULT);
3198 	}
3199 
3200 	/* accept all other CM messages (i.e. let the CM handle them) */
3201 	return (IBT_CM_ACCEPT);
3202 }
3203 
3204 static rdma_stat
3205 rib_register_service(rib_hca_t *hca, int service_type,
3206 	uint8_t protocol_num, in_port_t dst_port)
3207 {
3208 	ibt_srv_desc_t		sdesc;
3209 	ibt_hca_portinfo_t	*port_infop;
3210 	ib_svc_id_t		srv_id;
3211 	ibt_srv_hdl_t		srv_hdl;
3212 	uint_t			port_size;
3213 	uint_t			pki, i, num_ports, nbinds;
3214 	ibt_status_t		ibt_status;
3215 	rib_service_t		*service;
3216 	ib_pkey_t		pkey;
3217 
3218 	/*
3219 	 * Query all ports for the given HCA
3220 	 */
3221 	rw_enter(&hca->state_lock, RW_READER);
3222 	if (hca->state != HCA_DETACHED) {
3223 		ibt_status = ibt_query_hca_ports(hca->hca_hdl, 0, &port_infop,
3224 		    &num_ports, &port_size);
3225 		rw_exit(&hca->state_lock);
3226 	} else {
3227 		rw_exit(&hca->state_lock);
3228 		return (RDMA_FAILED);
3229 	}
3230 	if (ibt_status != IBT_SUCCESS) {
3231 		return (RDMA_FAILED);
3232 	}
3233 
3234 	DTRACE_PROBE1(rpcib__i__regservice_numports,
3235 	    int, num_ports);
3236 
3237 	for (i = 0; i < num_ports; i++) {
3238 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE) {
3239 			DTRACE_PROBE1(rpcib__i__regservice__portinactive,
3240 			    int, i+1);
3241 		} else if (port_infop[i].p_linkstate == IBT_PORT_ACTIVE) {
3242 			DTRACE_PROBE1(rpcib__i__regservice__portactive,
3243 			    int, i+1);
3244 		}
3245 	}
3246 
3247 	/*
3248 	 * Get all the IP addresses on this system to register the
3249 	 * given "service type" on all DNS recognized IP addrs.
3250 	 * Each service type such as NFS will have all the systems
3251 	 * IP addresses as its different names. For now the only
3252 	 * type of service we support in RPCIB is NFS.
3253 	 */
3254 	rw_enter(&rib_stat->service_list_lock, RW_WRITER);
3255 	/*
3256 	 * Start registering and binding service to active
3257 	 * on active ports on this HCA.
3258 	 */
3259 	nbinds = 0;
3260 	for (service = rib_stat->service_list;
3261 	    service && (service->srv_type != service_type);
3262 	    service = service->next)
3263 		;
3264 
3265 	if (service == NULL) {
3266 		/*
3267 		 * We use IP addresses as the service names for
3268 		 * service registration.  Register each of them
3269 		 * with CM to obtain a svc_id and svc_hdl.  We do not
3270 		 * register the service with machine's loopback address.
3271 		 */
3272 		(void) bzero(&srv_id, sizeof (ib_svc_id_t));
3273 		(void) bzero(&srv_hdl, sizeof (ibt_srv_hdl_t));
3274 		(void) bzero(&sdesc, sizeof (ibt_srv_desc_t));
3275 		sdesc.sd_handler = rib_srv_cm_handler;
3276 		sdesc.sd_flags = 0;
3277 		ibt_status = ibt_register_service(hca->ibt_clnt_hdl,
3278 		    &sdesc, ibt_get_ip_sid(protocol_num, dst_port),
3279 		    1, &srv_hdl, &srv_id);
3280 		if ((ibt_status != IBT_SUCCESS) &&
3281 		    (ibt_status != IBT_CM_SERVICE_EXISTS)) {
3282 			rw_exit(&rib_stat->service_list_lock);
3283 			DTRACE_PROBE1(rpcib__i__regservice__ibtres,
3284 			    int, ibt_status);
3285 			ibt_free_portinfo(port_infop, port_size);
3286 			return (RDMA_FAILED);
3287 		}
3288 
3289 		/*
3290 		 * Allocate and prepare a service entry
3291 		 */
3292 		service = kmem_zalloc(sizeof (rib_service_t), KM_SLEEP);
3293 
3294 		service->srv_type = service_type;
3295 		service->srv_hdl = srv_hdl;
3296 		service->srv_id = srv_id;
3297 
3298 		service->next = rib_stat->service_list;
3299 		rib_stat->service_list = service;
3300 		DTRACE_PROBE1(rpcib__i__regservice__new__service,
3301 		    int, service->srv_type);
3302 	} else {
3303 		srv_hdl = service->srv_hdl;
3304 		srv_id = service->srv_id;
3305 		DTRACE_PROBE1(rpcib__i__regservice__existing__service,
3306 		    int, service->srv_type);
3307 	}
3308 
3309 	for (i = 0; i < num_ports; i++) {
3310 		ibt_sbind_hdl_t		sbp;
3311 		rib_hca_service_t	*hca_srv;
3312 		ib_gid_t		gid;
3313 
3314 		if (port_infop[i].p_linkstate != IBT_PORT_ACTIVE)
3315 			continue;
3316 
3317 		for (pki = 0; pki < port_infop[i].p_pkey_tbl_sz; pki++) {
3318 			pkey = port_infop[i].p_pkey_tbl[pki];
3319 
3320 			rw_enter(&hca->bound_services_lock, RW_READER);
3321 			gid = port_infop[i].p_sgid_tbl[0];
3322 			for (hca_srv = hca->bound_services; hca_srv;
3323 			    hca_srv = hca_srv->next) {
3324 				if ((hca_srv->srv_id == service->srv_id) &&
3325 				    (hca_srv->gid.gid_prefix ==
3326 				    gid.gid_prefix) &&
3327 				    (hca_srv->gid.gid_guid == gid.gid_guid))
3328 					break;
3329 			}
3330 			rw_exit(&hca->bound_services_lock);
3331 			if (hca_srv != NULL) {
3332 				/*
3333 				 * port is alreay bound the the service
3334 				 */
3335 				DTRACE_PROBE1(
3336 				    rpcib__i__regservice__already__bound,
3337 				    int, i+1);
3338 				nbinds++;
3339 				continue;
3340 			}
3341 
3342 			if ((pkey & IBSRM_HB) &&
3343 			    (pkey != IB_PKEY_INVALID_FULL)) {
3344 
3345 				sbp = NULL;
3346 				ibt_status = ibt_bind_service(srv_hdl,
3347 				    gid, NULL, hca, &sbp);
3348 
3349 				if (ibt_status == IBT_SUCCESS) {
3350 					hca_srv = kmem_zalloc(
3351 					    sizeof (rib_hca_service_t),
3352 					    KM_SLEEP);
3353 					hca_srv->srv_id = srv_id;
3354 					hca_srv->gid = gid;
3355 					hca_srv->sbind_hdl = sbp;
3356 
3357 					rw_enter(&hca->bound_services_lock,
3358 					    RW_WRITER);
3359 					hca_srv->next = hca->bound_services;
3360 					hca->bound_services = hca_srv;
3361 					rw_exit(&hca->bound_services_lock);
3362 					nbinds++;
3363 				}
3364 
3365 				DTRACE_PROBE1(rpcib__i__regservice__bindres,
3366 				    int, ibt_status);
3367 			}
3368 		}
3369 	}
3370 	rw_exit(&rib_stat->service_list_lock);
3371 
3372 	ibt_free_portinfo(port_infop, port_size);
3373 
3374 	if (nbinds == 0) {
3375 		return (RDMA_FAILED);
3376 	} else {
3377 		/*
3378 		 * Put this plugin into accept state, since atleast
3379 		 * one registration was successful.
3380 		 */
3381 		mutex_enter(&plugin_state_lock);
3382 		plugin_state = ACCEPT;
3383 		mutex_exit(&plugin_state_lock);
3384 		return (RDMA_SUCCESS);
3385 	}
3386 }
3387 
3388 void
3389 rib_listen(struct rdma_svc_data *rd)
3390 {
3391 	rdma_stat status;
3392 	int n_listening = 0;
3393 	rib_hca_t *hca;
3394 
3395 	mutex_enter(&rib_stat->listen_lock);
3396 	/*
3397 	 * if rd parameter is NULL then it means that rib_stat->q is
3398 	 * already initialized by a call from RDMA and we just want to
3399 	 * add a newly attached HCA to the same listening state as other
3400 	 * HCAs.
3401 	 */
3402 	if (rd == NULL) {
3403 		if (rib_stat->q == NULL) {
3404 			mutex_exit(&rib_stat->listen_lock);
3405 			return;
3406 		}
3407 	} else {
3408 		rib_stat->q = &rd->q;
3409 	}
3410 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3411 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3412 		/*
3413 		 * First check if a hca is still attached
3414 		 */
3415 		rw_enter(&hca->state_lock, RW_READER);
3416 		if (hca->state != HCA_INITED) {
3417 			rw_exit(&hca->state_lock);
3418 			continue;
3419 		}
3420 		rw_exit(&hca->state_lock);
3421 
3422 		/*
3423 		 * Right now the only service type is NFS. Hence
3424 		 * force feed this value. Ideally to communicate
3425 		 * the service type it should be passed down in
3426 		 * rdma_svc_data.
3427 		 */
3428 		status = rib_register_service(hca, NFS,
3429 		    IPPROTO_TCP, nfs_rdma_port);
3430 		if (status == RDMA_SUCCESS)
3431 			n_listening++;
3432 	}
3433 	rw_exit(&rib_stat->hcas_list_lock);
3434 
3435 	/*
3436 	 * Service active on an HCA, check rd->err_code for more
3437 	 * explainable errors.
3438 	 */
3439 	if (rd) {
3440 		if (n_listening > 0) {
3441 			rd->active = 1;
3442 			rd->err_code = RDMA_SUCCESS;
3443 		} else {
3444 			rd->active = 0;
3445 			rd->err_code = RDMA_FAILED;
3446 		}
3447 	}
3448 	mutex_exit(&rib_stat->listen_lock);
3449 }
3450 
3451 /* XXXX */
3452 /* ARGSUSED */
3453 static void
3454 rib_listen_stop(struct rdma_svc_data *svcdata)
3455 {
3456 	rib_hca_t		*hca;
3457 
3458 	mutex_enter(&rib_stat->listen_lock);
3459 	/*
3460 	 * KRPC called the RDMATF to stop the listeners, this means
3461 	 * stop sending incomming or recieved requests to KRPC master
3462 	 * transport handle for RDMA-IB. This is also means that the
3463 	 * master transport handle, responsible for us, is going away.
3464 	 */
3465 	mutex_enter(&plugin_state_lock);
3466 	plugin_state = NO_ACCEPT;
3467 	if (svcdata != NULL)
3468 		svcdata->active = 0;
3469 	mutex_exit(&plugin_state_lock);
3470 
3471 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
3472 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
3473 		/*
3474 		 * First check if a hca is still attached
3475 		 */
3476 		rw_enter(&hca->state_lock, RW_READER);
3477 		if (hca->state == HCA_DETACHED) {
3478 			rw_exit(&hca->state_lock);
3479 			continue;
3480 		}
3481 		rib_close_channels(&hca->srv_conn_list);
3482 		rib_stop_services(hca);
3483 		rw_exit(&hca->state_lock);
3484 	}
3485 	rw_exit(&rib_stat->hcas_list_lock);
3486 
3487 	/*
3488 	 * Avoid rib_listen() using the stale q field.
3489 	 * This could happen if a port goes up after all services
3490 	 * are already unregistered.
3491 	 */
3492 	rib_stat->q = NULL;
3493 	mutex_exit(&rib_stat->listen_lock);
3494 }
3495 
3496 /*
3497  * Traverse the HCA's service list to unbind and deregister services.
3498  * For each bound service of HCA to be removed, first find the corresponding
3499  * service handle (srv_hdl) and then unbind the service by calling
3500  * ibt_unbind_service().
3501  */
3502 static void
3503 rib_stop_services(rib_hca_t *hca)
3504 {
3505 	rib_hca_service_t *srv_list, *to_remove;
3506 
3507 	/*
3508 	 * unbind and deregister the services for this service type.
3509 	 * Right now there is only one service type. In future it will
3510 	 * be passed down to this function.
3511 	 */
3512 	rw_enter(&hca->bound_services_lock, RW_READER);
3513 	srv_list = hca->bound_services;
3514 	hca->bound_services = NULL;
3515 	rw_exit(&hca->bound_services_lock);
3516 
3517 	while (srv_list != NULL) {
3518 		rib_service_t *sc;
3519 
3520 		to_remove = srv_list;
3521 		srv_list = to_remove->next;
3522 		rw_enter(&rib_stat->service_list_lock, RW_READER);
3523 		for (sc = rib_stat->service_list;
3524 		    sc && (sc->srv_id != to_remove->srv_id);
3525 		    sc = sc->next)
3526 			;
3527 		/*
3528 		 * if sc is NULL then the service doesn't exist anymore,
3529 		 * probably just removed completely through rib_stat.
3530 		 */
3531 		if (sc != NULL)
3532 			(void) ibt_unbind_service(sc->srv_hdl,
3533 			    to_remove->sbind_hdl);
3534 		rw_exit(&rib_stat->service_list_lock);
3535 		kmem_free(to_remove, sizeof (rib_hca_service_t));
3536 	}
3537 }
3538 
3539 static struct svc_recv *
3540 rib_init_svc_recv(rib_qp_t *qp, ibt_wr_ds_t *sgl)
3541 {
3542 	struct svc_recv	*recvp;
3543 
3544 	recvp = kmem_zalloc(sizeof (struct svc_recv), KM_SLEEP);
3545 	recvp->vaddr = sgl->ds_va;
3546 	recvp->qp = qp;
3547 	recvp->bytes_xfer = 0;
3548 	return (recvp);
3549 }
3550 
3551 static int
3552 rib_free_svc_recv(struct svc_recv *recvp)
3553 {
3554 	kmem_free(recvp, sizeof (*recvp));
3555 
3556 	return (0);
3557 }
3558 
3559 static struct reply *
3560 rib_addreplylist(rib_qp_t *qp, uint32_t msgid)
3561 {
3562 	struct reply	*rep;
3563 
3564 
3565 	rep = kmem_zalloc(sizeof (struct reply), KM_NOSLEEP);
3566 	if (rep == NULL) {
3567 		DTRACE_PROBE(rpcib__i__addrreply__nomem);
3568 		return (NULL);
3569 	}
3570 	rep->xid = msgid;
3571 	rep->vaddr_cq = NULL;
3572 	rep->bytes_xfer = 0;
3573 	rep->status = (uint_t)REPLY_WAIT;
3574 	rep->prev = NULL;
3575 	cv_init(&rep->wait_cv, NULL, CV_DEFAULT, NULL);
3576 
3577 	mutex_enter(&qp->replylist_lock);
3578 	if (qp->replylist) {
3579 		rep->next = qp->replylist;
3580 		qp->replylist->prev = rep;
3581 	}
3582 	qp->rep_list_size++;
3583 
3584 	DTRACE_PROBE1(rpcib__i__addrreply__listsize,
3585 	    int, qp->rep_list_size);
3586 
3587 	qp->replylist = rep;
3588 	mutex_exit(&qp->replylist_lock);
3589 
3590 	return (rep);
3591 }
3592 
3593 static rdma_stat
3594 rib_rem_replylist(rib_qp_t *qp)
3595 {
3596 	struct reply	*r, *n;
3597 
3598 	mutex_enter(&qp->replylist_lock);
3599 	for (r = qp->replylist; r != NULL; r = n) {
3600 		n = r->next;
3601 		(void) rib_remreply(qp, r);
3602 	}
3603 	mutex_exit(&qp->replylist_lock);
3604 
3605 	return (RDMA_SUCCESS);
3606 }
3607 
3608 static int
3609 rib_remreply(rib_qp_t *qp, struct reply *rep)
3610 {
3611 
3612 	ASSERT(MUTEX_HELD(&qp->replylist_lock));
3613 	if (rep->prev) {
3614 		rep->prev->next = rep->next;
3615 	}
3616 	if (rep->next) {
3617 		rep->next->prev = rep->prev;
3618 	}
3619 	if (qp->replylist == rep)
3620 		qp->replylist = rep->next;
3621 
3622 	cv_destroy(&rep->wait_cv);
3623 	qp->rep_list_size--;
3624 
3625 	DTRACE_PROBE1(rpcib__i__remreply__listsize,
3626 	    int, qp->rep_list_size);
3627 
3628 	kmem_free(rep, sizeof (*rep));
3629 
3630 	return (0);
3631 }
3632 
3633 rdma_stat
3634 rib_registermem(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3635 	struct mrc *buf_handle)
3636 {
3637 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3638 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3639 	rdma_stat	status;
3640 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3641 
3642 	/*
3643 	 * Note: ALL buffer pools use the same memory type RDMARW.
3644 	 */
3645 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3646 	if (status == RDMA_SUCCESS) {
3647 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3648 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3649 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3650 	} else {
3651 		buf_handle->mrc_linfo = NULL;
3652 		buf_handle->mrc_lmr = 0;
3653 		buf_handle->mrc_rmr = 0;
3654 	}
3655 	return (status);
3656 }
3657 
3658 static rdma_stat
3659 rib_reg_mem(rib_hca_t *hca, caddr_t adsp, caddr_t buf, uint_t size,
3660 	ibt_mr_flags_t spec,
3661 	ibt_mr_hdl_t *mr_hdlp, ibt_mr_desc_t *mr_descp)
3662 {
3663 	ibt_mr_attr_t	mem_attr;
3664 	ibt_status_t	ibt_status;
3665 	mem_attr.mr_vaddr = (uintptr_t)buf;
3666 	mem_attr.mr_len = (ib_msglen_t)size;
3667 	mem_attr.mr_as = (struct as *)(caddr_t)adsp;
3668 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE |
3669 	    IBT_MR_ENABLE_REMOTE_READ | IBT_MR_ENABLE_REMOTE_WRITE |
3670 	    IBT_MR_ENABLE_WINDOW_BIND | spec;
3671 
3672 	rw_enter(&hca->state_lock, RW_READER);
3673 	if (hca->state != HCA_DETACHED) {
3674 		ibt_status = ibt_register_mr(hca->hca_hdl, hca->pd_hdl,
3675 		    &mem_attr, mr_hdlp, mr_descp);
3676 		rw_exit(&hca->state_lock);
3677 	} else {
3678 		rw_exit(&hca->state_lock);
3679 		return (RDMA_FAILED);
3680 	}
3681 
3682 	if (ibt_status != IBT_SUCCESS) {
3683 		return (RDMA_FAILED);
3684 	}
3685 	return (RDMA_SUCCESS);
3686 }
3687 
3688 rdma_stat
3689 rib_registermemsync(CONN *conn,  caddr_t adsp, caddr_t buf, uint_t buflen,
3690 	struct mrc *buf_handle, RIB_SYNCMEM_HANDLE *sync_handle, void *lrc)
3691 {
3692 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
3693 	rib_lrc_entry_t *l;
3694 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
3695 	rdma_stat	status;
3696 	rib_hca_t	*hca = (ctoqp(conn))->hca;
3697 
3698 	/*
3699 	 * Non-coherent memory registration.
3700 	 */
3701 	l = (rib_lrc_entry_t *)lrc;
3702 	if (l) {
3703 		if (l->registered) {
3704 			buf_handle->mrc_linfo =
3705 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3706 			buf_handle->mrc_lmr =
3707 			    (uint32_t)l->lrc_mhandle.mrc_lmr;
3708 			buf_handle->mrc_rmr =
3709 			    (uint32_t)l->lrc_mhandle.mrc_rmr;
3710 			*sync_handle = (RIB_SYNCMEM_HANDLE)
3711 			    (uintptr_t)l->lrc_mhandle.mrc_linfo;
3712 			return (RDMA_SUCCESS);
3713 		} else {
3714 			/* Always register the whole buffer */
3715 			buf = (caddr_t)l->lrc_buf;
3716 			buflen = l->lrc_len;
3717 		}
3718 	}
3719 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
3720 
3721 	if (status == RDMA_SUCCESS) {
3722 		if (l) {
3723 			l->lrc_mhandle.mrc_linfo = (uintptr_t)mr_hdl;
3724 			l->lrc_mhandle.mrc_lmr   = (uint32_t)mr_desc.md_lkey;
3725 			l->lrc_mhandle.mrc_rmr   = (uint32_t)mr_desc.md_rkey;
3726 			l->registered		 = TRUE;
3727 		}
3728 		buf_handle->mrc_linfo = (uintptr_t)mr_hdl;
3729 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
3730 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
3731 		*sync_handle = (RIB_SYNCMEM_HANDLE)mr_hdl;
3732 	} else {
3733 		buf_handle->mrc_linfo = NULL;
3734 		buf_handle->mrc_lmr = 0;
3735 		buf_handle->mrc_rmr = 0;
3736 	}
3737 	return (status);
3738 }
3739 
3740 /* ARGSUSED */
3741 rdma_stat
3742 rib_deregistermem(CONN *conn, caddr_t buf, struct mrc buf_handle)
3743 {
3744 	rib_hca_t *hca = (ctoqp(conn))->hca;
3745 	/*
3746 	 * Allow memory deregistration even if HCA is
3747 	 * getting detached. Need all outstanding
3748 	 * memory registrations to be deregistered
3749 	 * before HCA_DETACH_EVENT can be accepted.
3750 	 */
3751 	(void) ibt_deregister_mr(hca->hca_hdl,
3752 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
3753 	return (RDMA_SUCCESS);
3754 }
3755 
3756 /* ARGSUSED */
3757 rdma_stat
3758 rib_deregistermemsync(CONN *conn, caddr_t buf, struct mrc buf_handle,
3759 		RIB_SYNCMEM_HANDLE sync_handle, void *lrc)
3760 {
3761 	rib_lrc_entry_t *l;
3762 	l = (rib_lrc_entry_t *)lrc;
3763 	if (l)
3764 		if (l->registered)
3765 			return (RDMA_SUCCESS);
3766 
3767 	(void) rib_deregistermem(conn, buf, buf_handle);
3768 
3769 	return (RDMA_SUCCESS);
3770 }
3771 
3772 /* ARGSUSED */
3773 rdma_stat
3774 rib_syncmem(CONN *conn, RIB_SYNCMEM_HANDLE shandle, caddr_t buf,
3775 		int len, int cpu)
3776 {
3777 	ibt_status_t	status;
3778 	rib_hca_t *hca = (ctoqp(conn))->hca;
3779 	ibt_mr_sync_t	mr_segment;
3780 
3781 	mr_segment.ms_handle = (ibt_mr_hdl_t)shandle;
3782 	mr_segment.ms_vaddr = (ib_vaddr_t)(uintptr_t)buf;
3783 	mr_segment.ms_len = (ib_memlen_t)len;
3784 	if (cpu) {
3785 		/* make incoming data visible to memory */
3786 		mr_segment.ms_flags = IBT_SYNC_WRITE;
3787 	} else {
3788 		/* make memory changes visible to IO */
3789 		mr_segment.ms_flags = IBT_SYNC_READ;
3790 	}
3791 	rw_enter(&hca->state_lock, RW_READER);
3792 	if (hca->state != HCA_DETACHED) {
3793 		status = ibt_sync_mr(hca->hca_hdl, &mr_segment, 1);
3794 		rw_exit(&hca->state_lock);
3795 	} else {
3796 		rw_exit(&hca->state_lock);
3797 		return (RDMA_FAILED);
3798 	}
3799 
3800 	if (status == IBT_SUCCESS)
3801 		return (RDMA_SUCCESS);
3802 	else {
3803 		return (RDMA_FAILED);
3804 	}
3805 }
3806 
3807 /*
3808  * XXXX	????
3809  */
3810 static rdma_stat
3811 rib_getinfo(rdma_info_t *info)
3812 {
3813 	/*
3814 	 * XXXX	Hack!
3815 	 */
3816 	info->addrlen = 16;
3817 	info->mts = 1000000;
3818 	info->mtu = 1000000;
3819 
3820 	return (RDMA_SUCCESS);
3821 }
3822 
3823 rib_bufpool_t *
3824 rib_rbufpool_create(rib_hca_t *hca, int ptype, int num)
3825 {
3826 	rib_bufpool_t	*rbp = NULL;
3827 	bufpool_t	*bp = NULL;
3828 	caddr_t		buf;
3829 	ibt_mr_attr_t	mem_attr;
3830 	ibt_status_t	ibt_status;
3831 	int		i, j;
3832 
3833 	rbp = (rib_bufpool_t *)kmem_zalloc(sizeof (rib_bufpool_t), KM_SLEEP);
3834 
3835 	bp = (bufpool_t *)kmem_zalloc(sizeof (bufpool_t) +
3836 	    num * sizeof (void *), KM_SLEEP);
3837 
3838 	mutex_init(&bp->buflock, NULL, MUTEX_DRIVER, hca->iblock);
3839 	bp->numelems = num;
3840 
3841 
3842 	switch (ptype) {
3843 	case SEND_BUFFER:
3844 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3845 		bp->rsize = RPC_MSG_SZ;
3846 		break;
3847 	case RECV_BUFFER:
3848 		mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3849 		bp->rsize = RPC_BUF_SIZE;
3850 		break;
3851 	default:
3852 		goto fail;
3853 	}
3854 
3855 	/*
3856 	 * Register the pool.
3857 	 */
3858 	bp->bufsize = num * bp->rsize;
3859 	bp->buf = kmem_zalloc(bp->bufsize, KM_SLEEP);
3860 	rbp->mr_hdl = (ibt_mr_hdl_t *)kmem_zalloc(num *
3861 	    sizeof (ibt_mr_hdl_t), KM_SLEEP);
3862 	rbp->mr_desc = (ibt_mr_desc_t *)kmem_zalloc(num *
3863 	    sizeof (ibt_mr_desc_t), KM_SLEEP);
3864 	rw_enter(&hca->state_lock, RW_READER);
3865 
3866 	if (hca->state == HCA_DETACHED) {
3867 		rw_exit(&hca->state_lock);
3868 		goto fail;
3869 	}
3870 
3871 	for (i = 0, buf = bp->buf; i < num; i++, buf += bp->rsize) {
3872 		bzero(&rbp->mr_desc[i], sizeof (ibt_mr_desc_t));
3873 		mem_attr.mr_vaddr = (uintptr_t)buf;
3874 		mem_attr.mr_len = (ib_msglen_t)bp->rsize;
3875 		mem_attr.mr_as = NULL;
3876 		ibt_status = ibt_register_mr(hca->hca_hdl,
3877 		    hca->pd_hdl, &mem_attr,
3878 		    &rbp->mr_hdl[i],
3879 		    &rbp->mr_desc[i]);
3880 		if (ibt_status != IBT_SUCCESS) {
3881 			for (j = 0; j < i; j++) {
3882 				(void) ibt_deregister_mr(hca->hca_hdl,
3883 				    rbp->mr_hdl[j]);
3884 			}
3885 			rw_exit(&hca->state_lock);
3886 			goto fail;
3887 		}
3888 	}
3889 	rw_exit(&hca->state_lock);
3890 	buf = (caddr_t)bp->buf;
3891 	for (i = 0; i < num; i++, buf += bp->rsize) {
3892 		bp->buflist[i] = (void *)buf;
3893 	}
3894 	bp->buffree = num - 1;	/* no. of free buffers */
3895 	rbp->bpool = bp;
3896 
3897 	return (rbp);
3898 fail:
3899 	if (bp) {
3900 		if (bp->buf)
3901 			kmem_free(bp->buf, bp->bufsize);
3902 		kmem_free(bp, sizeof (bufpool_t) + num*sizeof (void *));
3903 	}
3904 	if (rbp) {
3905 		if (rbp->mr_hdl)
3906 			kmem_free(rbp->mr_hdl, num*sizeof (ibt_mr_hdl_t));
3907 		if (rbp->mr_desc)
3908 			kmem_free(rbp->mr_desc, num*sizeof (ibt_mr_desc_t));
3909 		kmem_free(rbp, sizeof (rib_bufpool_t));
3910 	}
3911 	return (NULL);
3912 }
3913 
3914 static void
3915 rib_rbufpool_deregister(rib_hca_t *hca, int ptype)
3916 {
3917 	int i;
3918 	rib_bufpool_t *rbp = NULL;
3919 	bufpool_t *bp;
3920 
3921 	/*
3922 	 * Obtain pool address based on type of pool
3923 	 */
3924 	switch (ptype) {
3925 		case SEND_BUFFER:
3926 			rbp = hca->send_pool;
3927 			break;
3928 		case RECV_BUFFER:
3929 			rbp = hca->recv_pool;
3930 			break;
3931 		default:
3932 			return;
3933 	}
3934 	if (rbp == NULL)
3935 		return;
3936 
3937 	bp = rbp->bpool;
3938 
3939 	/*
3940 	 * Deregister the pool memory and free it.
3941 	 */
3942 	for (i = 0; i < bp->numelems; i++) {
3943 		(void) ibt_deregister_mr(hca->hca_hdl, rbp->mr_hdl[i]);
3944 	}
3945 }
3946 
3947 static void
3948 rib_rbufpool_free(rib_hca_t *hca, int ptype)
3949 {
3950 
3951 	rib_bufpool_t *rbp = NULL;
3952 	bufpool_t *bp;
3953 
3954 	/*
3955 	 * Obtain pool address based on type of pool
3956 	 */
3957 	switch (ptype) {
3958 		case SEND_BUFFER:
3959 			rbp = hca->send_pool;
3960 			break;
3961 		case RECV_BUFFER:
3962 			rbp = hca->recv_pool;
3963 			break;
3964 		default:
3965 			return;
3966 	}
3967 	if (rbp == NULL)
3968 		return;
3969 
3970 	bp = rbp->bpool;
3971 
3972 	/*
3973 	 * Free the pool memory.
3974 	 */
3975 	if (rbp->mr_hdl)
3976 		kmem_free(rbp->mr_hdl, bp->numelems*sizeof (ibt_mr_hdl_t));
3977 
3978 	if (rbp->mr_desc)
3979 		kmem_free(rbp->mr_desc, bp->numelems*sizeof (ibt_mr_desc_t));
3980 	if (bp->buf)
3981 		kmem_free(bp->buf, bp->bufsize);
3982 	mutex_destroy(&bp->buflock);
3983 	kmem_free(bp, sizeof (bufpool_t) + bp->numelems*sizeof (void *));
3984 	kmem_free(rbp, sizeof (rib_bufpool_t));
3985 }
3986 
3987 void
3988 rib_rbufpool_destroy(rib_hca_t *hca, int ptype)
3989 {
3990 	/*
3991 	 * Deregister the pool memory and free it.
3992 	 */
3993 	rib_rbufpool_deregister(hca, ptype);
3994 	rib_rbufpool_free(hca, ptype);
3995 }
3996 
3997 /*
3998  * Fetch a buffer from the pool of type specified in rdbuf->type.
3999  */
4000 static rdma_stat
4001 rib_reg_buf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4002 {
4003 	rib_lrc_entry_t *rlep;
4004 
4005 	if (rdbuf->type ==  RDMA_LONG_BUFFER) {
4006 		rlep = rib_get_cache_buf(conn, rdbuf->len);
4007 		rdbuf->rb_private =  (caddr_t)rlep;
4008 		rdbuf->addr = rlep->lrc_buf;
4009 		rdbuf->handle = rlep->lrc_mhandle;
4010 		return (RDMA_SUCCESS);
4011 	}
4012 
4013 	rdbuf->addr = rib_rbuf_alloc(conn, rdbuf);
4014 	if (rdbuf->addr) {
4015 		switch (rdbuf->type) {
4016 		case SEND_BUFFER:
4017 			rdbuf->len = RPC_MSG_SZ;	/* 1K */
4018 			break;
4019 		case RECV_BUFFER:
4020 			rdbuf->len = RPC_BUF_SIZE; /* 2K */
4021 			break;
4022 		default:
4023 			rdbuf->len = 0;
4024 		}
4025 		return (RDMA_SUCCESS);
4026 	} else
4027 		return (RDMA_FAILED);
4028 }
4029 
4030 /*
4031  * Fetch a buffer of specified type.
4032  * Note that rdbuf->handle is mw's rkey.
4033  */
4034 static void *
4035 rib_rbuf_alloc(CONN *conn, rdma_buf_t *rdbuf)
4036 {
4037 	rib_qp_t	*qp = ctoqp(conn);
4038 	rib_hca_t	*hca = qp->hca;
4039 	rdma_btype	ptype = rdbuf->type;
4040 	void		*buf;
4041 	rib_bufpool_t	*rbp = NULL;
4042 	bufpool_t	*bp;
4043 	int		i;
4044 
4045 	/*
4046 	 * Obtain pool address based on type of pool
4047 	 */
4048 	switch (ptype) {
4049 	case SEND_BUFFER:
4050 		rbp = hca->send_pool;
4051 		break;
4052 	case RECV_BUFFER:
4053 		rbp = hca->recv_pool;
4054 		break;
4055 	default:
4056 		return (NULL);
4057 	}
4058 	if (rbp == NULL)
4059 		return (NULL);
4060 
4061 	bp = rbp->bpool;
4062 
4063 	mutex_enter(&bp->buflock);
4064 	if (bp->buffree < 0) {
4065 		mutex_exit(&bp->buflock);
4066 		return (NULL);
4067 	}
4068 
4069 	/* XXXX put buf, rdbuf->handle.mrc_rmr, ... in one place. */
4070 	buf = bp->buflist[bp->buffree];
4071 	rdbuf->addr = buf;
4072 	rdbuf->len = bp->rsize;
4073 	for (i = bp->numelems - 1; i >= 0; i--) {
4074 		if ((ib_vaddr_t)(uintptr_t)buf == rbp->mr_desc[i].md_vaddr) {
4075 			rdbuf->handle.mrc_rmr =
4076 			    (uint32_t)rbp->mr_desc[i].md_rkey;
4077 			rdbuf->handle.mrc_linfo =
4078 			    (uintptr_t)rbp->mr_hdl[i];
4079 			rdbuf->handle.mrc_lmr =
4080 			    (uint32_t)rbp->mr_desc[i].md_lkey;
4081 			bp->buffree--;
4082 
4083 			mutex_exit(&bp->buflock);
4084 
4085 			return (buf);
4086 		}
4087 	}
4088 
4089 	mutex_exit(&bp->buflock);
4090 
4091 	return (NULL);
4092 }
4093 
4094 static void
4095 rib_reg_buf_free(CONN *conn, rdma_buf_t *rdbuf)
4096 {
4097 
4098 	if (rdbuf->type == RDMA_LONG_BUFFER) {
4099 		rib_free_cache_buf(conn, (rib_lrc_entry_t *)rdbuf->rb_private);
4100 		rdbuf->rb_private = NULL;
4101 		return;
4102 	}
4103 	rib_rbuf_free(conn, rdbuf->type, rdbuf->addr);
4104 }
4105 
4106 static void
4107 rib_rbuf_free(CONN *conn, int ptype, void *buf)
4108 {
4109 	rib_qp_t *qp = ctoqp(conn);
4110 	rib_hca_t *hca = qp->hca;
4111 	rib_bufpool_t *rbp = NULL;
4112 	bufpool_t *bp;
4113 
4114 	/*
4115 	 * Obtain pool address based on type of pool
4116 	 */
4117 	switch (ptype) {
4118 	case SEND_BUFFER:
4119 		rbp = hca->send_pool;
4120 		break;
4121 	case RECV_BUFFER:
4122 		rbp = hca->recv_pool;
4123 		break;
4124 	default:
4125 		return;
4126 	}
4127 	if (rbp == NULL)
4128 		return;
4129 
4130 	bp = rbp->bpool;
4131 
4132 	mutex_enter(&bp->buflock);
4133 	if (++bp->buffree >= bp->numelems) {
4134 		/*
4135 		 * Should never happen
4136 		 */
4137 		bp->buffree--;
4138 	} else {
4139 		bp->buflist[bp->buffree] = buf;
4140 	}
4141 	mutex_exit(&bp->buflock);
4142 }
4143 
4144 static rdma_stat
4145 rib_add_connlist(CONN *cn, rib_conn_list_t *connlist)
4146 {
4147 	rw_enter(&connlist->conn_lock, RW_WRITER);
4148 	if (connlist->conn_hd) {
4149 		cn->c_next = connlist->conn_hd;
4150 		connlist->conn_hd->c_prev = cn;
4151 	}
4152 	connlist->conn_hd = cn;
4153 	rw_exit(&connlist->conn_lock);
4154 
4155 	return (RDMA_SUCCESS);
4156 }
4157 
4158 static rdma_stat
4159 rib_rm_conn(CONN *cn, rib_conn_list_t *connlist)
4160 {
4161 	rw_enter(&connlist->conn_lock, RW_WRITER);
4162 	if (cn->c_prev) {
4163 		cn->c_prev->c_next = cn->c_next;
4164 	}
4165 	if (cn->c_next) {
4166 		cn->c_next->c_prev = cn->c_prev;
4167 	}
4168 	if (connlist->conn_hd == cn)
4169 		connlist->conn_hd = cn->c_next;
4170 	rw_exit(&connlist->conn_lock);
4171 
4172 	return (RDMA_SUCCESS);
4173 }
4174 
4175 /* ARGSUSED */
4176 static rdma_stat
4177 rib_conn_get(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4178     int addr_type, void *handle, CONN **conn)
4179 {
4180 	rdma_stat status;
4181 	rpcib_ping_t rpt;
4182 
4183 	status = rib_connect(s_svcaddr, d_svcaddr, addr_type, &rpt, conn);
4184 	return (status);
4185 }
4186 
4187 /*
4188  * rib_find_hca_connection
4189  *
4190  * if there is an existing connection to the specified address then
4191  * it will be returned in conn, otherwise conn will be set to NULL.
4192  * Also cleans up any connection that is in error state.
4193  */
4194 static int
4195 rib_find_hca_connection(rib_hca_t *hca, struct netbuf *s_svcaddr,
4196     struct netbuf *d_svcaddr, CONN **conn)
4197 {
4198 	CONN *cn;
4199 	clock_t cv_stat, timout;
4200 
4201 	*conn = NULL;
4202 again:
4203 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4204 	cn = hca->cl_conn_list.conn_hd;
4205 	while (cn != NULL) {
4206 		/*
4207 		 * First, clear up any connection in the ERROR state
4208 		 */
4209 		mutex_enter(&cn->c_lock);
4210 		if (cn->c_state == C_ERROR_CONN) {
4211 			if (cn->c_ref == 0) {
4212 				/*
4213 				 * Remove connection from list and destroy it.
4214 				 */
4215 				cn->c_state = C_DISCONN_PEND;
4216 				mutex_exit(&cn->c_lock);
4217 				rw_exit(&hca->cl_conn_list.conn_lock);
4218 				rib_conn_close((void *)cn);
4219 				goto again;
4220 			}
4221 			mutex_exit(&cn->c_lock);
4222 			cn = cn->c_next;
4223 			continue;
4224 		}
4225 		if (cn->c_state == C_DISCONN_PEND) {
4226 			mutex_exit(&cn->c_lock);
4227 			cn = cn->c_next;
4228 			continue;
4229 		}
4230 
4231 		/*
4232 		 * source address is only checked for if there is one,
4233 		 * this is the case for retries.
4234 		 */
4235 		if ((cn->c_raddr.len == d_svcaddr->len) &&
4236 		    (bcmp(d_svcaddr->buf, cn->c_raddr.buf,
4237 		    d_svcaddr->len) == 0) &&
4238 		    ((s_svcaddr->len == 0) ||
4239 		    ((cn->c_laddr.len == s_svcaddr->len) &&
4240 		    (bcmp(s_svcaddr->buf, cn->c_laddr.buf,
4241 		    s_svcaddr->len) == 0)))) {
4242 			/*
4243 			 * Our connection. Give up conn list lock
4244 			 * as we are done traversing the list.
4245 			 */
4246 			rw_exit(&hca->cl_conn_list.conn_lock);
4247 			if (cn->c_state == C_CONNECTED) {
4248 				cn->c_ref++;	/* sharing a conn */
4249 				mutex_exit(&cn->c_lock);
4250 				*conn = cn;
4251 				return (RDMA_SUCCESS);
4252 			}
4253 			if (cn->c_state == C_CONN_PEND) {
4254 				/*
4255 				 * Hold a reference to this conn before
4256 				 * we give up the lock.
4257 				 */
4258 				cn->c_ref++;
4259 				timout =  ddi_get_lbolt() +
4260 				    drv_usectohz(CONN_WAIT_TIME * 1000000);
4261 				while ((cv_stat = cv_timedwait_sig(&cn->c_cv,
4262 				    &cn->c_lock, timout)) > 0 &&
4263 				    cn->c_state == C_CONN_PEND)
4264 					;
4265 				if (cv_stat == 0) {
4266 					cn->c_ref--;
4267 					mutex_exit(&cn->c_lock);
4268 					return (RDMA_INTR);
4269 				}
4270 				if (cv_stat < 0) {
4271 					cn->c_ref--;
4272 					mutex_exit(&cn->c_lock);
4273 					return (RDMA_TIMEDOUT);
4274 				}
4275 				if (cn->c_state == C_CONNECTED) {
4276 					*conn = cn;
4277 					mutex_exit(&cn->c_lock);
4278 					return (RDMA_SUCCESS);
4279 				} else {
4280 					cn->c_ref--;
4281 					mutex_exit(&cn->c_lock);
4282 					return (RDMA_TIMEDOUT);
4283 				}
4284 			}
4285 		}
4286 		mutex_exit(&cn->c_lock);
4287 		cn = cn->c_next;
4288 	}
4289 	rw_exit(&hca->cl_conn_list.conn_lock);
4290 	*conn = NULL;
4291 	return (RDMA_FAILED);
4292 }
4293 
4294 /*
4295  * Connection management.
4296  * IBTF does not support recycling of channels. So connections are only
4297  * in four states - C_CONN_PEND, or C_CONNECTED, or C_ERROR_CONN or
4298  * C_DISCONN_PEND state. No C_IDLE state.
4299  * C_CONN_PEND state: Connection establishment in progress to the server.
4300  * C_CONNECTED state: A connection when created is in C_CONNECTED state.
4301  * It has an RC channel associated with it. ibt_post_send/recv are allowed
4302  * only in this state.
4303  * C_ERROR_CONN state: A connection transitions to this state when WRs on the
4304  * channel are completed in error or an IBT_CM_EVENT_CONN_CLOSED event
4305  * happens on the channel or a IBT_HCA_DETACH_EVENT occurs on the HCA.
4306  * C_DISCONN_PEND state: When a connection is in C_ERROR_CONN state and when
4307  * c_ref drops to 0 (this indicates that RPC has no more references to this
4308  * connection), the connection should be destroyed. A connection transitions
4309  * into this state when it is being destroyed.
4310  */
4311 /* ARGSUSED */
4312 static rdma_stat
4313 rib_connect(struct netbuf *s_svcaddr, struct netbuf *d_svcaddr,
4314     int addr_type, rpcib_ping_t *rpt, CONN **conn)
4315 {
4316 	CONN *cn;
4317 	int status;
4318 	rib_hca_t *hca;
4319 	rib_qp_t *qp;
4320 	int s_addr_len;
4321 	char *s_addr_buf;
4322 
4323 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
4324 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
4325 		rw_enter(&hca->state_lock, RW_READER);
4326 		if (hca->state != HCA_DETACHED) {
4327 			status = rib_find_hca_connection(hca, s_svcaddr,
4328 			    d_svcaddr, conn);
4329 			rw_exit(&hca->state_lock);
4330 			if ((status == RDMA_INTR) || (status == RDMA_SUCCESS)) {
4331 				rw_exit(&rib_stat->hcas_list_lock);
4332 				return (status);
4333 			}
4334 		} else
4335 			rw_exit(&hca->state_lock);
4336 	}
4337 	rw_exit(&rib_stat->hcas_list_lock);
4338 
4339 	/*
4340 	 * No existing connection found, establish a new connection.
4341 	 */
4342 	bzero(rpt, sizeof (rpcib_ping_t));
4343 
4344 	status = rib_ping_srv(addr_type, d_svcaddr, rpt);
4345 	if (status != RDMA_SUCCESS) {
4346 		return (RDMA_FAILED);
4347 	}
4348 	hca = rpt->hca;
4349 
4350 	if (rpt->srcip.family == AF_INET) {
4351 		s_addr_len = sizeof (rpt->srcip.un.ip4addr);
4352 		s_addr_buf = (char *)&rpt->srcip.un.ip4addr;
4353 	} else if (rpt->srcip.family == AF_INET6) {
4354 		s_addr_len = sizeof (rpt->srcip.un.ip6addr);
4355 		s_addr_buf = (char *)&rpt->srcip.un.ip6addr;
4356 	} else {
4357 		return (RDMA_FAILED);
4358 	}
4359 
4360 	/*
4361 	 * Channel to server doesn't exist yet, create one.
4362 	 */
4363 	if (rib_clnt_create_chan(hca, d_svcaddr, &qp) != RDMA_SUCCESS) {
4364 		return (RDMA_FAILED);
4365 	}
4366 	cn = qptoc(qp);
4367 	cn->c_state = C_CONN_PEND;
4368 	cn->c_ref = 1;
4369 
4370 	cn->c_laddr.buf = kmem_alloc(s_addr_len, KM_SLEEP);
4371 	bcopy(s_addr_buf, cn->c_laddr.buf, s_addr_len);
4372 	cn->c_laddr.len = cn->c_laddr.maxlen = s_addr_len;
4373 
4374 	if (rpt->srcip.family == AF_INET) {
4375 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP) + 1, KM_SLEEP);
4376 		(void) strcpy(cn->c_netid, RIBNETID_TCP);
4377 	} else {
4378 		cn->c_netid = kmem_zalloc(strlen(RIBNETID_TCP6) + 1, KM_SLEEP);
4379 		(void) strcpy(cn->c_netid, RIBNETID_TCP6);
4380 	}
4381 
4382 	/*
4383 	 * Add to conn list.
4384 	 * We had given up the READER lock. In the time since then,
4385 	 * another thread might have created the connection we are
4386 	 * trying here. But for now, that is quiet alright - there
4387 	 * might be two connections between a pair of hosts instead
4388 	 * of one. If we really want to close that window,
4389 	 * then need to check the list after acquiring the
4390 	 * WRITER lock.
4391 	 */
4392 	(void) rib_add_connlist(cn, &hca->cl_conn_list);
4393 	status = rib_conn_to_srv(hca, qp, rpt);
4394 	mutex_enter(&cn->c_lock);
4395 	if (status == RDMA_SUCCESS) {
4396 		cn->c_state = C_CONNECTED;
4397 		*conn = cn;
4398 	} else {
4399 		cn->c_state = C_ERROR_CONN;
4400 		cn->c_ref--;
4401 	}
4402 	cv_broadcast(&cn->c_cv);
4403 	mutex_exit(&cn->c_lock);
4404 	return (status);
4405 }
4406 
4407 static void
4408 rib_conn_close(void *rarg)
4409 {
4410 	CONN *conn = (CONN *)rarg;
4411 	rib_qp_t *qp = ctoqp(conn);
4412 
4413 	mutex_enter(&conn->c_lock);
4414 	if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4415 
4416 		conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4417 		/*
4418 		 * Live connection in CONNECTED state.
4419 		 */
4420 		if (conn->c_state == C_CONNECTED) {
4421 			conn->c_state = C_ERROR_CONN;
4422 		}
4423 		mutex_exit(&conn->c_lock);
4424 
4425 		rib_close_a_channel(conn);
4426 
4427 		mutex_enter(&conn->c_lock);
4428 		conn->c_flags &= ~C_CLOSE_PENDING;
4429 		cv_signal(&conn->c_cv);
4430 	}
4431 
4432 	mutex_exit(&conn->c_lock);
4433 
4434 	if (qp->mode == RIB_SERVER)
4435 		(void) rib_disconnect_channel(conn,
4436 		    &qp->hca->srv_conn_list);
4437 	else
4438 		(void) rib_disconnect_channel(conn,
4439 		    &qp->hca->cl_conn_list);
4440 }
4441 
4442 static void
4443 rib_conn_timeout_call(void *carg)
4444 {
4445 	time_t idle_time;
4446 	CONN *conn = (CONN *)carg;
4447 	rib_hca_t *hca = ctoqp(conn)->hca;
4448 	int error;
4449 
4450 	mutex_enter(&conn->c_lock);
4451 	if ((conn->c_ref > 0) ||
4452 	    (conn->c_state == C_DISCONN_PEND)) {
4453 		conn->c_timeout = NULL;
4454 		mutex_exit(&conn->c_lock);
4455 		return;
4456 	}
4457 
4458 	idle_time = (gethrestime_sec() - conn->c_last_used);
4459 
4460 	if ((idle_time <= rib_conn_timeout) &&
4461 	    (conn->c_state != C_ERROR_CONN)) {
4462 		/*
4463 		 * There was activity after the last timeout.
4464 		 * Extend the conn life. Unless the conn is
4465 		 * already in error state.
4466 		 */
4467 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4468 		    SEC_TO_TICK(rib_conn_timeout - idle_time));
4469 		mutex_exit(&conn->c_lock);
4470 		return;
4471 	}
4472 
4473 	error = ddi_taskq_dispatch(hca->cleanup_helper, rib_conn_close,
4474 	    (void *)conn, DDI_NOSLEEP);
4475 
4476 	/*
4477 	 * If taskq dispatch fails above, then reset the timeout
4478 	 * to try again after 10 secs.
4479 	 */
4480 
4481 	if (error != DDI_SUCCESS) {
4482 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4483 		    SEC_TO_TICK(RDMA_CONN_REAP_RETRY));
4484 		mutex_exit(&conn->c_lock);
4485 		return;
4486 	}
4487 
4488 	conn->c_state = C_DISCONN_PEND;
4489 	mutex_exit(&conn->c_lock);
4490 }
4491 
4492 static rdma_stat
4493 rib_conn_release(CONN *conn)
4494 {
4495 
4496 	mutex_enter(&conn->c_lock);
4497 	conn->c_ref--;
4498 
4499 	conn->c_last_used = gethrestime_sec();
4500 	if (conn->c_ref > 0) {
4501 		mutex_exit(&conn->c_lock);
4502 		return (RDMA_SUCCESS);
4503 	}
4504 
4505 	/*
4506 	 * If a conn is C_ERROR_CONN, close the channel.
4507 	 */
4508 	if (conn->c_ref == 0 && conn->c_state == C_ERROR_CONN) {
4509 		conn->c_state = C_DISCONN_PEND;
4510 		mutex_exit(&conn->c_lock);
4511 		rib_conn_close((void *)conn);
4512 		return (RDMA_SUCCESS);
4513 	}
4514 
4515 	/*
4516 	 * c_ref == 0, set a timeout for conn release
4517 	 */
4518 
4519 	if (conn->c_timeout == NULL) {
4520 		conn->c_timeout = timeout(rib_conn_timeout_call, conn,
4521 		    SEC_TO_TICK(rib_conn_timeout));
4522 	}
4523 
4524 	mutex_exit(&conn->c_lock);
4525 	return (RDMA_SUCCESS);
4526 }
4527 
4528 /*
4529  * Add at front of list
4530  */
4531 static struct rdma_done_list *
4532 rdma_done_add(rib_qp_t *qp, uint32_t xid)
4533 {
4534 	struct rdma_done_list *rd;
4535 
4536 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4537 
4538 	rd = kmem_alloc(sizeof (*rd), KM_SLEEP);
4539 	rd->xid = xid;
4540 	cv_init(&rd->rdma_done_cv, NULL, CV_DEFAULT, NULL);
4541 
4542 	rd->prev = NULL;
4543 	rd->next = qp->rdlist;
4544 	if (qp->rdlist != NULL)
4545 		qp->rdlist->prev = rd;
4546 	qp->rdlist = rd;
4547 
4548 	return (rd);
4549 }
4550 
4551 static void
4552 rdma_done_rm(rib_qp_t *qp, struct rdma_done_list *rd)
4553 {
4554 	struct rdma_done_list *r;
4555 
4556 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4557 
4558 	r = rd->next;
4559 	if (r != NULL) {
4560 		r->prev = rd->prev;
4561 	}
4562 
4563 	r = rd->prev;
4564 	if (r != NULL) {
4565 		r->next = rd->next;
4566 	} else {
4567 		qp->rdlist = rd->next;
4568 	}
4569 
4570 	cv_destroy(&rd->rdma_done_cv);
4571 	kmem_free(rd, sizeof (*rd));
4572 }
4573 
4574 static void
4575 rdma_done_rem_list(rib_qp_t *qp)
4576 {
4577 	struct rdma_done_list	*r, *n;
4578 
4579 	mutex_enter(&qp->rdlist_lock);
4580 	for (r = qp->rdlist; r != NULL; r = n) {
4581 		n = r->next;
4582 		rdma_done_rm(qp, r);
4583 	}
4584 	mutex_exit(&qp->rdlist_lock);
4585 }
4586 
4587 static void
4588 rdma_done_notify(rib_qp_t *qp, uint32_t xid)
4589 {
4590 	struct rdma_done_list *r = qp->rdlist;
4591 
4592 	ASSERT(MUTEX_HELD(&qp->rdlist_lock));
4593 
4594 	while (r) {
4595 		if (r->xid == xid) {
4596 			cv_signal(&r->rdma_done_cv);
4597 			return;
4598 		} else {
4599 			r = r->next;
4600 		}
4601 	}
4602 	DTRACE_PROBE1(rpcib__i__donenotify__nomatchxid,
4603 	    int, xid);
4604 }
4605 
4606 /*
4607  * Expects conn->c_lock to be held by the caller.
4608  */
4609 
4610 static void
4611 rib_close_a_channel(CONN *conn)
4612 {
4613 	rib_qp_t	*qp;
4614 	qp = ctoqp(conn);
4615 
4616 	if (qp->qp_hdl == NULL) {
4617 		/* channel already freed */
4618 		return;
4619 	}
4620 
4621 	/*
4622 	 * Call ibt_close_rc_channel in blocking mode
4623 	 * with no callbacks.
4624 	 */
4625 	(void) ibt_close_rc_channel(qp->qp_hdl, IBT_NOCALLBACKS,
4626 	    NULL, 0, NULL, NULL, 0);
4627 }
4628 
4629 /*
4630  * Goes through all connections and closes the channel
4631  * This will cause all the WRs on those channels to be
4632  * flushed.
4633  */
4634 static void
4635 rib_close_channels(rib_conn_list_t *connlist)
4636 {
4637 	CONN 		*conn, *tmp;
4638 
4639 	rw_enter(&connlist->conn_lock, RW_READER);
4640 	conn = connlist->conn_hd;
4641 	while (conn != NULL) {
4642 		mutex_enter(&conn->c_lock);
4643 		tmp = conn->c_next;
4644 		if (!(conn->c_flags & C_CLOSE_NOTNEEDED)) {
4645 
4646 			conn->c_flags |= (C_CLOSE_NOTNEEDED | C_CLOSE_PENDING);
4647 
4648 			/*
4649 			 * Live connection in CONNECTED state.
4650 			 */
4651 			if (conn->c_state == C_CONNECTED)
4652 				conn->c_state = C_ERROR_CONN;
4653 			mutex_exit(&conn->c_lock);
4654 
4655 			rib_close_a_channel(conn);
4656 
4657 			mutex_enter(&conn->c_lock);
4658 			conn->c_flags &= ~C_CLOSE_PENDING;
4659 			/* Signal a pending rib_disconnect_channel() */
4660 			cv_signal(&conn->c_cv);
4661 		}
4662 		mutex_exit(&conn->c_lock);
4663 		conn = tmp;
4664 	}
4665 	rw_exit(&connlist->conn_lock);
4666 }
4667 
4668 /*
4669  * Frees up all connections that are no longer being referenced
4670  */
4671 static void
4672 rib_purge_connlist(rib_conn_list_t *connlist)
4673 {
4674 	CONN 		*conn;
4675 
4676 top:
4677 	rw_enter(&connlist->conn_lock, RW_READER);
4678 	conn = connlist->conn_hd;
4679 	while (conn != NULL) {
4680 		mutex_enter(&conn->c_lock);
4681 
4682 		/*
4683 		 * At this point connection is either in ERROR
4684 		 * or DISCONN_PEND state. If in DISCONN_PEND state
4685 		 * then some other thread is culling that connection.
4686 		 * If not and if c_ref is 0, then destroy the connection.
4687 		 */
4688 		if (conn->c_ref == 0 &&
4689 		    conn->c_state != C_DISCONN_PEND) {
4690 			/*
4691 			 * Cull the connection
4692 			 */
4693 			conn->c_state = C_DISCONN_PEND;
4694 			mutex_exit(&conn->c_lock);
4695 			rw_exit(&connlist->conn_lock);
4696 			(void) rib_disconnect_channel(conn, connlist);
4697 			goto top;
4698 		} else {
4699 			/*
4700 			 * conn disconnect already scheduled or will
4701 			 * happen from conn_release when c_ref drops to 0.
4702 			 */
4703 			mutex_exit(&conn->c_lock);
4704 		}
4705 		conn = conn->c_next;
4706 	}
4707 	rw_exit(&connlist->conn_lock);
4708 
4709 	/*
4710 	 * At this point, only connections with c_ref != 0 are on the list
4711 	 */
4712 }
4713 
4714 /*
4715  * Free all the HCA resources and close
4716  * the hca.
4717  */
4718 
4719 static void
4720 rib_free_hca(rib_hca_t *hca)
4721 {
4722 	(void) ibt_free_cq(hca->clnt_rcq->rib_cq_hdl);
4723 	(void) ibt_free_cq(hca->clnt_scq->rib_cq_hdl);
4724 	(void) ibt_free_cq(hca->svc_rcq->rib_cq_hdl);
4725 	(void) ibt_free_cq(hca->svc_scq->rib_cq_hdl);
4726 
4727 	kmem_free(hca->clnt_rcq, sizeof (rib_cq_t));
4728 	kmem_free(hca->clnt_scq, sizeof (rib_cq_t));
4729 	kmem_free(hca->svc_rcq, sizeof (rib_cq_t));
4730 	kmem_free(hca->svc_scq, sizeof (rib_cq_t));
4731 
4732 	rib_rbufpool_destroy(hca, RECV_BUFFER);
4733 	rib_rbufpool_destroy(hca, SEND_BUFFER);
4734 	rib_destroy_cache(hca);
4735 	if (rib_mod.rdma_count == 0)
4736 		rdma_unregister_mod(&rib_mod);
4737 	(void) ibt_free_pd(hca->hca_hdl, hca->pd_hdl);
4738 	(void) ibt_close_hca(hca->hca_hdl);
4739 	hca->hca_hdl = NULL;
4740 }
4741 
4742 
4743 static void
4744 rib_stop_hca_services(rib_hca_t *hca)
4745 {
4746 	rib_stop_services(hca);
4747 	rib_close_channels(&hca->cl_conn_list);
4748 	rib_close_channels(&hca->srv_conn_list);
4749 
4750 	rib_purge_connlist(&hca->cl_conn_list);
4751 	rib_purge_connlist(&hca->srv_conn_list);
4752 
4753 	if ((rib_stat->hcas_list == NULL) && stats_enabled) {
4754 		kstat_delete_byname_zone("unix", 0, "rpcib_cache",
4755 		    GLOBAL_ZONEID);
4756 		stats_enabled = FALSE;
4757 	}
4758 
4759 	rw_enter(&hca->srv_conn_list.conn_lock, RW_READER);
4760 	rw_enter(&hca->cl_conn_list.conn_lock, RW_READER);
4761 	if (hca->srv_conn_list.conn_hd == NULL &&
4762 	    hca->cl_conn_list.conn_hd == NULL) {
4763 		/*
4764 		 * conn_lists are NULL, so destroy
4765 		 * buffers, close hca and be done.
4766 		 */
4767 		rib_free_hca(hca);
4768 	}
4769 	rw_exit(&hca->cl_conn_list.conn_lock);
4770 	rw_exit(&hca->srv_conn_list.conn_lock);
4771 
4772 	if (hca->hca_hdl != NULL) {
4773 		mutex_enter(&hca->inuse_lock);
4774 		while (hca->inuse)
4775 			cv_wait(&hca->cb_cv, &hca->inuse_lock);
4776 		mutex_exit(&hca->inuse_lock);
4777 
4778 		rib_free_hca(hca);
4779 	}
4780 	rw_destroy(&hca->bound_services_lock);
4781 
4782 	if (hca->cleanup_helper != NULL) {
4783 		ddi_taskq_destroy(hca->cleanup_helper);
4784 		hca->cleanup_helper = NULL;
4785 	}
4786 }
4787 
4788 /*
4789  * Cleans and closes up all uses of the HCA
4790  */
4791 static void
4792 rib_detach_hca(rib_hca_t *hca)
4793 {
4794 	rib_hca_t **hcap;
4795 
4796 	/*
4797 	 * Stop all services on the HCA
4798 	 * Go through cl_conn_list and close all rc_channels
4799 	 * Go through svr_conn_list and close all rc_channels
4800 	 * Free connections whose c_ref has dropped to 0
4801 	 * Destroy all CQs
4802 	 * Deregister and released all buffer pool memory after all
4803 	 * connections are destroyed
4804 	 * Free the protection domain
4805 	 * ibt_close_hca()
4806 	 */
4807 	rw_enter(&hca->state_lock, RW_WRITER);
4808 	if (hca->state == HCA_DETACHED) {
4809 		rw_exit(&hca->state_lock);
4810 		return;
4811 	}
4812 
4813 	hca->state = HCA_DETACHED;
4814 	rw_enter(&rib_stat->hcas_list_lock, RW_WRITER);
4815 	for (hcap = &rib_stat->hcas_list; *hcap && (*hcap != hca);
4816 	    hcap = &(*hcap)->next)
4817 		;
4818 	ASSERT(*hcap == hca);
4819 	*hcap = hca->next;
4820 	rib_stat->nhca_inited--;
4821 	rib_mod.rdma_count--;
4822 	rw_exit(&rib_stat->hcas_list_lock);
4823 	rw_exit(&hca->state_lock);
4824 
4825 	rib_stop_hca_services(hca);
4826 
4827 	kmem_free(hca, sizeof (*hca));
4828 }
4829 
4830 static void
4831 rib_server_side_cache_reclaim(void *argp)
4832 {
4833 	cache_avl_struct_t    *rcas;
4834 	rib_lrc_entry_t		*rb;
4835 	rib_hca_t *hca = (rib_hca_t *)argp;
4836 
4837 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4838 	rcas = avl_first(&hca->avl_tree);
4839 	if (rcas != NULL)
4840 		avl_remove(&hca->avl_tree, rcas);
4841 
4842 	while (rcas != NULL) {
4843 		while (rcas->r.forw != &rcas->r) {
4844 			rcas->elements--;
4845 			rb = rcas->r.forw;
4846 			remque(rb);
4847 			if (rb->registered)
4848 				(void) rib_deregistermem_via_hca(hca,
4849 				    rb->lrc_buf, rb->lrc_mhandle);
4850 
4851 			hca->cache_allocation -= rb->lrc_len;
4852 			kmem_free(rb->lrc_buf, rb->lrc_len);
4853 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4854 		}
4855 		mutex_destroy(&rcas->node_lock);
4856 		kmem_cache_free(hca->server_side_cache, rcas);
4857 		rcas = avl_first(&hca->avl_tree);
4858 		if (rcas != NULL)
4859 			avl_remove(&hca->avl_tree, rcas);
4860 	}
4861 	rw_exit(&hca->avl_rw_lock);
4862 }
4863 
4864 static void
4865 rib_server_side_cache_cleanup(void *argp)
4866 {
4867 	cache_avl_struct_t    *rcas;
4868 	rib_lrc_entry_t		*rb;
4869 	rib_hca_t *hca = (rib_hca_t *)argp;
4870 
4871 	mutex_enter(&hca->cache_allocation_lock);
4872 	if (hca->cache_allocation < cache_limit) {
4873 		mutex_exit(&hca->cache_allocation_lock);
4874 		return;
4875 	}
4876 	mutex_exit(&hca->cache_allocation_lock);
4877 
4878 	rw_enter(&hca->avl_rw_lock, RW_WRITER);
4879 	rcas = avl_last(&hca->avl_tree);
4880 	if (rcas != NULL)
4881 		avl_remove(&hca->avl_tree, rcas);
4882 
4883 	while (rcas != NULL) {
4884 		while (rcas->r.forw != &rcas->r) {
4885 			rcas->elements--;
4886 			rb = rcas->r.forw;
4887 			remque(rb);
4888 			if (rb->registered)
4889 				(void) rib_deregistermem_via_hca(hca,
4890 				    rb->lrc_buf, rb->lrc_mhandle);
4891 
4892 			hca->cache_allocation -= rb->lrc_len;
4893 
4894 			kmem_free(rb->lrc_buf, rb->lrc_len);
4895 			kmem_free(rb, sizeof (rib_lrc_entry_t));
4896 		}
4897 		mutex_destroy(&rcas->node_lock);
4898 		if (hca->server_side_cache) {
4899 			kmem_cache_free(hca->server_side_cache, rcas);
4900 		}
4901 
4902 		if (hca->cache_allocation < cache_limit) {
4903 			rw_exit(&hca->avl_rw_lock);
4904 			return;
4905 		}
4906 
4907 		rcas = avl_last(&hca->avl_tree);
4908 		if (rcas != NULL)
4909 			avl_remove(&hca->avl_tree, rcas);
4910 	}
4911 	rw_exit(&hca->avl_rw_lock);
4912 }
4913 
4914 static int
4915 avl_compare(const void *t1, const void *t2)
4916 {
4917 	if (((cache_avl_struct_t *)t1)->len == ((cache_avl_struct_t *)t2)->len)
4918 		return (0);
4919 
4920 	if (((cache_avl_struct_t *)t1)->len < ((cache_avl_struct_t *)t2)->len)
4921 		return (-1);
4922 
4923 	return (1);
4924 }
4925 
4926 static void
4927 rib_destroy_cache(rib_hca_t *hca)
4928 {
4929 	if (hca->avl_init) {
4930 		rib_server_side_cache_reclaim((void *)hca);
4931 		if (hca->server_side_cache) {
4932 			kmem_cache_destroy(hca->server_side_cache);
4933 			hca->server_side_cache = NULL;
4934 		}
4935 		avl_destroy(&hca->avl_tree);
4936 		mutex_destroy(&hca->cache_allocation_lock);
4937 		rw_destroy(&hca->avl_rw_lock);
4938 	}
4939 	hca->avl_init = FALSE;
4940 }
4941 
4942 static void
4943 rib_force_cleanup(void *hca)
4944 {
4945 	if (((rib_hca_t *)hca)->cleanup_helper != NULL)
4946 		(void) ddi_taskq_dispatch(
4947 		    ((rib_hca_t *)hca)->cleanup_helper,
4948 		    rib_server_side_cache_cleanup,
4949 		    (void *)hca, DDI_NOSLEEP);
4950 }
4951 
4952 static rib_lrc_entry_t *
4953 rib_get_cache_buf(CONN *conn, uint32_t len)
4954 {
4955 	cache_avl_struct_t	cas, *rcas;
4956 	rib_hca_t	*hca = (ctoqp(conn))->hca;
4957 	rib_lrc_entry_t *reply_buf;
4958 	avl_index_t where = NULL;
4959 	uint64_t c_alloc = 0;
4960 
4961 	if (!hca->avl_init)
4962 		goto  error_alloc;
4963 
4964 	cas.len = len;
4965 
4966 	rw_enter(&hca->avl_rw_lock, RW_READER);
4967 
4968 	mutex_enter(&hca->cache_allocation_lock);
4969 	c_alloc = hca->cache_allocation;
4970 	mutex_exit(&hca->cache_allocation_lock);
4971 
4972 	if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree, &cas,
4973 	    &where)) == NULL) {
4974 		/* Am I above the cache limit */
4975 		if ((c_alloc + len) >= cache_limit) {
4976 			rib_force_cleanup((void *)hca);
4977 			rw_exit(&hca->avl_rw_lock);
4978 			mutex_enter(&hca->cache_allocation_lock);
4979 			hca->cache_misses_above_the_limit ++;
4980 			mutex_exit(&hca->cache_allocation_lock);
4981 
4982 			/* Allocate and register the buffer directly */
4983 			goto error_alloc;
4984 		}
4985 
4986 		rw_exit(&hca->avl_rw_lock);
4987 		rw_enter(&hca->avl_rw_lock, RW_WRITER);
4988 
4989 		/* Recheck to make sure no other thread added the entry in */
4990 		if ((rcas = (cache_avl_struct_t *)avl_find(&hca->avl_tree,
4991 		    &cas, &where)) == NULL) {
4992 			/* Allocate an avl tree entry */
4993 			rcas = (cache_avl_struct_t *)
4994 			    kmem_cache_alloc(hca->server_side_cache, KM_SLEEP);
4995 
4996 			bzero(rcas, sizeof (cache_avl_struct_t));
4997 			rcas->elements = 0;
4998 			rcas->r.forw = &rcas->r;
4999 			rcas->r.back = &rcas->r;
5000 			rcas->len = len;
5001 			mutex_init(&rcas->node_lock, NULL, MUTEX_DEFAULT, NULL);
5002 			avl_insert(&hca->avl_tree, rcas, where);
5003 		}
5004 	}
5005 
5006 	mutex_enter(&rcas->node_lock);
5007 
5008 	if (rcas->r.forw != &rcas->r && rcas->elements > 0) {
5009 		reply_buf = rcas->r.forw;
5010 		remque(reply_buf);
5011 		rcas->elements--;
5012 		mutex_exit(&rcas->node_lock);
5013 		rw_exit(&hca->avl_rw_lock);
5014 
5015 		mutex_enter(&hca->cache_allocation_lock);
5016 		hca->cache_hits++;
5017 		hca->cache_allocation -= len;
5018 		mutex_exit(&hca->cache_allocation_lock);
5019 	} else {
5020 		/* Am I above the cache limit */
5021 		mutex_exit(&rcas->node_lock);
5022 		if ((c_alloc + len) >= cache_limit) {
5023 			rib_force_cleanup((void *)hca);
5024 			rw_exit(&hca->avl_rw_lock);
5025 
5026 			mutex_enter(&hca->cache_allocation_lock);
5027 			hca->cache_misses_above_the_limit++;
5028 			mutex_exit(&hca->cache_allocation_lock);
5029 			/* Allocate and register the buffer directly */
5030 			goto error_alloc;
5031 		}
5032 		rw_exit(&hca->avl_rw_lock);
5033 		mutex_enter(&hca->cache_allocation_lock);
5034 		hca->cache_misses++;
5035 		mutex_exit(&hca->cache_allocation_lock);
5036 		/* Allocate a reply_buf entry */
5037 		reply_buf = (rib_lrc_entry_t *)
5038 		    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5039 		bzero(reply_buf, sizeof (rib_lrc_entry_t));
5040 		reply_buf->lrc_buf  = kmem_alloc(len, KM_SLEEP);
5041 		reply_buf->lrc_len  = len;
5042 		reply_buf->registered = FALSE;
5043 		reply_buf->avl_node = (void *)rcas;
5044 	}
5045 
5046 	return (reply_buf);
5047 
5048 error_alloc:
5049 	reply_buf = (rib_lrc_entry_t *)
5050 	    kmem_zalloc(sizeof (rib_lrc_entry_t), KM_SLEEP);
5051 	bzero(reply_buf, sizeof (rib_lrc_entry_t));
5052 	reply_buf->lrc_buf = kmem_alloc(len, KM_SLEEP);
5053 	reply_buf->lrc_len = len;
5054 	reply_buf->registered = FALSE;
5055 	reply_buf->avl_node = NULL;
5056 
5057 	return (reply_buf);
5058 }
5059 
5060 /*
5061  * Return a pre-registered back to the cache (without
5062  * unregistering the buffer)..
5063  */
5064 
5065 static void
5066 rib_free_cache_buf(CONN *conn, rib_lrc_entry_t *reg_buf)
5067 {
5068 	cache_avl_struct_t    cas, *rcas;
5069 	avl_index_t where = NULL;
5070 	rib_hca_t	*hca = (ctoqp(conn))->hca;
5071 
5072 	if (!hca->avl_init)
5073 		goto  error_free;
5074 
5075 	cas.len = reg_buf->lrc_len;
5076 	rw_enter(&hca->avl_rw_lock, RW_READER);
5077 	if ((rcas = (cache_avl_struct_t *)
5078 	    avl_find(&hca->avl_tree, &cas, &where)) == NULL) {
5079 		rw_exit(&hca->avl_rw_lock);
5080 		goto error_free;
5081 	} else {
5082 		cas.len = reg_buf->lrc_len;
5083 		mutex_enter(&rcas->node_lock);
5084 		insque(reg_buf, &rcas->r);
5085 		rcas->elements ++;
5086 		mutex_exit(&rcas->node_lock);
5087 		rw_exit(&hca->avl_rw_lock);
5088 		mutex_enter(&hca->cache_allocation_lock);
5089 		hca->cache_allocation += cas.len;
5090 		mutex_exit(&hca->cache_allocation_lock);
5091 	}
5092 
5093 	return;
5094 
5095 error_free:
5096 
5097 	if (reg_buf->registered)
5098 		(void) rib_deregistermem_via_hca(hca,
5099 		    reg_buf->lrc_buf, reg_buf->lrc_mhandle);
5100 	kmem_free(reg_buf->lrc_buf, reg_buf->lrc_len);
5101 	kmem_free(reg_buf, sizeof (rib_lrc_entry_t));
5102 }
5103 
5104 static rdma_stat
5105 rib_registermem_via_hca(rib_hca_t *hca, caddr_t adsp, caddr_t buf,
5106 	uint_t buflen, struct mrc *buf_handle)
5107 {
5108 	ibt_mr_hdl_t	mr_hdl = NULL;	/* memory region handle */
5109 	ibt_mr_desc_t	mr_desc;	/* vaddr, lkey, rkey */
5110 	rdma_stat	status;
5111 
5112 
5113 	/*
5114 	 * Note: ALL buffer pools use the same memory type RDMARW.
5115 	 */
5116 	status = rib_reg_mem(hca, adsp, buf, buflen, 0, &mr_hdl, &mr_desc);
5117 	if (status == RDMA_SUCCESS) {
5118 		buf_handle->mrc_linfo = (uint64_t)(uintptr_t)mr_hdl;
5119 		buf_handle->mrc_lmr = (uint32_t)mr_desc.md_lkey;
5120 		buf_handle->mrc_rmr = (uint32_t)mr_desc.md_rkey;
5121 	} else {
5122 		buf_handle->mrc_linfo = NULL;
5123 		buf_handle->mrc_lmr = 0;
5124 		buf_handle->mrc_rmr = 0;
5125 	}
5126 	return (status);
5127 }
5128 
5129 /* ARGSUSED */
5130 static rdma_stat
5131 rib_deregistermemsync_via_hca(rib_hca_t *hca, caddr_t buf,
5132     struct mrc buf_handle, RIB_SYNCMEM_HANDLE sync_handle)
5133 {
5134 
5135 	(void) rib_deregistermem_via_hca(hca, buf, buf_handle);
5136 	return (RDMA_SUCCESS);
5137 }
5138 
5139 /* ARGSUSED */
5140 static rdma_stat
5141 rib_deregistermem_via_hca(rib_hca_t *hca, caddr_t buf, struct mrc buf_handle)
5142 {
5143 
5144 	(void) ibt_deregister_mr(hca->hca_hdl,
5145 	    (ibt_mr_hdl_t)(uintptr_t)buf_handle.mrc_linfo);
5146 	return (RDMA_SUCCESS);
5147 }
5148 
5149 /*
5150  * Check if the IP interface named by `lifrp' is RDMA-capable.
5151  */
5152 static boolean_t
5153 rpcib_rdma_capable_interface(struct lifreq *lifrp)
5154 {
5155 	char ifname[LIFNAMSIZ];
5156 	char *cp;
5157 
5158 	if (lifrp->lifr_type == IFT_IB)
5159 		return (B_TRUE);
5160 
5161 	/*
5162 	 * Strip off the logical interface portion before getting
5163 	 * intimate with the name.
5164 	 */
5165 	(void) strlcpy(ifname, lifrp->lifr_name, LIFNAMSIZ);
5166 	if ((cp = strchr(ifname, ':')) != NULL)
5167 		*cp = '\0';
5168 
5169 	return (strcmp("lo0", ifname) == 0);
5170 }
5171 
5172 static int
5173 rpcib_do_ip_ioctl(int cmd, int len, void *arg)
5174 {
5175 	vnode_t *kvp, *vp;
5176 	TIUSER  *tiptr;
5177 	struct  strioctl iocb;
5178 	k_sigset_t smask;
5179 	int	err = 0;
5180 
5181 	if (lookupname("/dev/udp", UIO_SYSSPACE, FOLLOW, NULLVPP, &kvp) == 0) {
5182 		if (t_kopen(NULL, kvp->v_rdev, FREAD|FWRITE,
5183 		    &tiptr, CRED()) == 0) {
5184 			vp = tiptr->fp->f_vnode;
5185 		} else {
5186 			VN_RELE(kvp);
5187 			return (EPROTO);
5188 		}
5189 	} else {
5190 		return (EPROTO);
5191 	}
5192 
5193 	iocb.ic_cmd = cmd;
5194 	iocb.ic_timout = 0;
5195 	iocb.ic_len = len;
5196 	iocb.ic_dp = (caddr_t)arg;
5197 	sigintr(&smask, 0);
5198 	err = kstr_ioctl(vp, I_STR, (intptr_t)&iocb);
5199 	sigunintr(&smask);
5200 	(void) t_kclose(tiptr, 0);
5201 	VN_RELE(kvp);
5202 	return (err);
5203 }
5204 
5205 /*
5206  * Issue an SIOCGLIFCONF down to IP and return the result in `lifcp'.
5207  * lifcp->lifc_buf is dynamically allocated to be *bufsizep bytes.
5208  */
5209 static int
5210 rpcib_do_lifconf(struct lifconf *lifcp, uint_t *bufsizep)
5211 {
5212 	int err;
5213 	struct lifnum lifn;
5214 
5215 	bzero(&lifn, sizeof (struct lifnum));
5216 	lifn.lifn_family = AF_UNSPEC;
5217 
5218 	err = rpcib_do_ip_ioctl(SIOCGLIFNUM, sizeof (struct lifnum), &lifn);
5219 	if (err != 0)
5220 		return (err);
5221 
5222 	/*
5223 	 * Pad the interface count to account for additional interfaces that
5224 	 * may have been configured between the SIOCGLIFNUM and SIOCGLIFCONF.
5225 	 */
5226 	lifn.lifn_count += 4;
5227 
5228 	bzero(lifcp, sizeof (struct lifconf));
5229 	lifcp->lifc_family = AF_UNSPEC;
5230 	lifcp->lifc_len = *bufsizep = lifn.lifn_count * sizeof (struct lifreq);
5231 	lifcp->lifc_buf = kmem_zalloc(*bufsizep, KM_SLEEP);
5232 
5233 	err = rpcib_do_ip_ioctl(SIOCGLIFCONF, sizeof (struct lifconf), lifcp);
5234 	if (err != 0) {
5235 		kmem_free(lifcp->lifc_buf, *bufsizep);
5236 		return (err);
5237 	}
5238 	return (0);
5239 }
5240 
5241 static boolean_t
5242 rpcib_get_ib_addresses(rpcib_ipaddrs_t *addrs4, rpcib_ipaddrs_t *addrs6)
5243 {
5244 	uint_t i, nifs;
5245 	uint_t bufsize;
5246 	struct lifconf lifc;
5247 	struct lifreq *lifrp;
5248 	struct sockaddr_in *sinp;
5249 	struct sockaddr_in6 *sin6p;
5250 
5251 	bzero(addrs4, sizeof (rpcib_ipaddrs_t));
5252 	bzero(addrs6, sizeof (rpcib_ipaddrs_t));
5253 
5254 	if (rpcib_do_lifconf(&lifc, &bufsize) != 0)
5255 		return (B_FALSE);
5256 
5257 	if ((nifs = lifc.lifc_len / sizeof (struct lifreq)) == 0) {
5258 		kmem_free(lifc.lifc_buf, bufsize);
5259 		return (B_FALSE);
5260 	}
5261 
5262 	/*
5263 	 * Worst case is that all of the addresses are IB-capable and have
5264 	 * the same address family, so size our buffers accordingly.
5265 	 */
5266 	addrs4->ri_size = nifs * sizeof (struct sockaddr_in);
5267 	addrs4->ri_list = kmem_zalloc(addrs4->ri_size, KM_SLEEP);
5268 	addrs6->ri_size = nifs * sizeof (struct sockaddr_in6);
5269 	addrs6->ri_list = kmem_zalloc(addrs6->ri_size, KM_SLEEP);
5270 
5271 	for (lifrp = lifc.lifc_req, i = 0; i < nifs; i++, lifrp++) {
5272 		if (!rpcib_rdma_capable_interface(lifrp))
5273 			continue;
5274 
5275 		if (lifrp->lifr_addr.ss_family == AF_INET) {
5276 			sinp = addrs4->ri_list;
5277 			bcopy(&lifrp->lifr_addr, &sinp[addrs4->ri_count++],
5278 			    sizeof (struct sockaddr_in));
5279 		} else if (lifrp->lifr_addr.ss_family == AF_INET6) {
5280 			sin6p = addrs6->ri_list;
5281 			bcopy(&lifrp->lifr_addr, &sin6p[addrs6->ri_count++],
5282 			    sizeof (struct sockaddr_in6));
5283 		}
5284 	}
5285 
5286 	kmem_free(lifc.lifc_buf, bufsize);
5287 	return (B_TRUE);
5288 }
5289 
5290 /* ARGSUSED */
5291 static int
5292 rpcib_cache_kstat_update(kstat_t *ksp, int rw)
5293 {
5294 	rib_hca_t *hca;
5295 
5296 	if (KSTAT_WRITE == rw) {
5297 		return (EACCES);
5298 	}
5299 
5300 	rpcib_kstat.cache_limit.value.ui64 =
5301 	    (uint64_t)cache_limit;
5302 	rw_enter(&rib_stat->hcas_list_lock, RW_READER);
5303 	for (hca = rib_stat->hcas_list; hca; hca = hca->next) {
5304 		rpcib_kstat.cache_allocation.value.ui64 +=
5305 		    (uint64_t)hca->cache_allocation;
5306 		rpcib_kstat.cache_hits.value.ui64 +=
5307 		    (uint64_t)hca->cache_hits;
5308 		rpcib_kstat.cache_misses.value.ui64 +=
5309 		    (uint64_t)hca->cache_misses;
5310 		rpcib_kstat.cache_misses_above_the_limit.value.ui64 +=
5311 		    (uint64_t)hca->cache_misses_above_the_limit;
5312 	}
5313 	rw_exit(&rib_stat->hcas_list_lock);
5314 	return (0);
5315 }
5316