xref: /illumos-gate/usr/src/uts/common/rpc/ib.h (revision 92a0208178405fef708b0283ffcaa02fbc3468ff)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /*
26  * Copyright (c) 2007, The Ohio State University. All rights reserved.
27  *
28  * Portions of this source code is developed by the team members of
29  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
30  * headed by Professor Dhabaleswar K. (DK) Panda.
31  *
32  * Acknowledgements to contributions from developors:
33  *   Ranjit Noronha: noronha@cse.ohio-state.edu
34  *   Lei Chai      : chail@cse.ohio-state.edu
35  *   Weikuan Yu    : yuw@cse.ohio-state.edu
36  *
37  */
38 
39 
40 #ifndef _IB_H
41 #define	_IB_H
42 
43 /*
44  * ib.h, rpcib plugin interface.
45  */
46 
47 #include <sys/types.h>
48 #include <sys/ddi.h>
49 #include <sys/sunddi.h>
50 #include <sys/conf.h>
51 #include <sys/stat.h>
52 #include <rpc/rpc.h>
53 #include <rpc/rpc_rdma.h>
54 #include <sys/ib/ibtl/ibti.h>
55 #include <sys/avl.h>
56 
57 #ifdef __cplusplus
58 extern "C" {
59 #endif
60 
#define	MAX_BUFS	1024	/* max no. of buffers per pool */

/*
 * Default CQ size.
 *
 * Tavor returns the next higher power of 2 CQ entries than the
 * requested size.  For instance, if you request (2^12 - 1) CQ
 * entries, Tavor returns 2^12 entries.  4K CQ entries suffice,
 * hence 4096 - 1.
 *
 * The replacement list is parenthesized so that the macro yields 4095
 * no matter which operators surround its use (e.g. 2 * DEF_CQ_SIZE).
 */
#define	DEF_CQ_SIZE	(4096 - 1)

#define	DEF_SQ_SIZE	128	/* default SendQ size */
#define	DEF_RQ_SIZE	256	/* default RecvQ size */
#define	DSEG_MAX	2
#define	RQ_DSEG_MAX	1	/* default RQ data seg */
#define	IBSRM_HB	0x8000	/* high order bit of pkey */

/* max no. of refresh attempts on IBT_CM_CONN_STALE error */
#define	REFRESH_ATTEMPTS	3
79 
/*
 * Forward typedefs for the CQ, HCA and QP structures, so that they can
 * be referred to by pointer ahead of their full definitions.
 */
typedef struct rib_cq_s		rib_cq_t;
typedef struct rib_hca_s	rib_hca_t;
typedef struct rib_qp_s		rib_qp_t;
83 
84 /*
85  * Notification for RDMA_DONE is based on xid
86  */
87 struct rdma_done_list {
88 	uint32_t	xid;		/* XID waiting for RDMA_DONE */
89 	kcondvar_t	rdma_done_cv;	/* cv for RDMA_DONE */
90 	struct rdma_done_list	*next;
91 	struct rdma_done_list	*prev;
92 };
93 
/*
 * Plugin states.
 *   ACCEPT    = accepting new connections and requests
 *   NO_ACCEPT = not accepting new connections and requests
 */
#define	ACCEPT		1
#define	NO_ACCEPT	2
101 
/*
 * Send wait state.
 * The negative value is parenthesized so the macro always expands to
 * a single self-contained expression.
 */
#define	SEND_WAIT	(-1)

/*
 * Reply state.
 * Parenthesized for the same reason as SEND_WAIT.
 */
#define	REPLY_WAIT	(-1)
111 
/*
 * Untyped pointer alias, and the sync-memory handle type that is
 * defined in terms of it.
 */
typedef void		*rib_pvoid;
typedef rib_pvoid	RIB_SYNCMEM_HANDLE;
114 
115 /*
116  * IB buffer pool management structure
117  */
118 
119 /*
120  * Buffer pool info
121  */
122 typedef struct {
123 	kmutex_t	buflock;	/* lock for this structure */
124 	caddr_t		buf;		/* pool address */
125 	uint32_t	bufhandle;	/* rkey for this pool */
126 	ulong_t		bufsize;	/* size of pool */
127 	int		rsize;		/* size of each element */
128 	int		numelems;	/* no. of elements allocated */
129 	int		buffree;	/* no. of free elements */
130 	void		*buflist[1];	/* free elements in pool */
131 } bufpool_t;
132 
133 typedef struct {
134 	bufpool_t	*bpool;
135 	ibt_mr_hdl_t	*mr_hdl;
136 	ibt_mr_desc_t	*mr_desc;	/* vaddr, lkey, rkey */
137 } rib_bufpool_t;
138 
/*
 * ATS related defines.
 */
#define	ATS_AR_DATA_LEN		16
#define	IBD_NAME		"ibd"
#define	N_IBD_INSTANCES		4
145 
146 
/*
 * Service types known to RPCIB.
 * For now only NFS is supported.
 */
#define	NFS	1
#define	NLM	2
153 
/*
 * Consumer role: tracks whether the consumer is a server or a client.
 * The enumerator values are spelled out; they match the implicit
 * numbering (0, 1).
 */
typedef enum {
	RIB_SERVER = 0,
	RIB_CLIENT = 1
} rib_mode_t;
161 
162 /*
163  * CQ structure
164  */
165 struct rib_cq_s {
166 	rib_hca_t		*rib_hca;
167 	ibt_cq_hdl_t		rib_cq_hdl;
168 };
169 
170 /*
171  * RPCIB plugin state
172  */
173 typedef struct rpcib_state {
174 	ibt_clnt_hdl_t		ibt_clnt_hdl;
175 	uint32_t		hca_count;
176 	uint32_t		nhca_inited;
177 	ib_guid_t		*hca_guids;
178 	rib_hca_t		*hcas;
179 	int			refcount;
180 	kmutex_t		open_hca_lock;
181 	rib_hca_t		*hca;		/* the hca being used */
182 	queue_t			*q;		/* up queue for a serv_type */
183 	uint32_t		service_type;	/* NFS, NLM, etc */
184 	void			*private;
185 } rpcib_state_t;
186 
187 /*
188  * Each registered service's data structure.
189  * Each HCA has a list of these structures, which are the registered
190  * services on this HCA.
191  */
192 typedef struct rib_service rib_service_t;
193 struct rib_service {
194 	uint32_t		srv_type;	/* i.e, NFS, NLM, v4CBD */
195 	ibt_srv_hdl_t		srv_hdl;	/* from ibt_register call */
196 	rib_service_t		*srv_next;
197 };
198 
199 /*
200  * Connection lists
201  */
202 typedef struct {
203 	krwlock_t	conn_lock;	/* list lock */
204 	CONN		*conn_hd;	/* list head */
205 } rib_conn_list_t;
206 
/*
 * HCA states.  The enumerator values are spelled out; they match the
 * implicit numbering (0, 1).
 */
enum hca_state {
	HCA_INITED = 0,		/* hca in up and running state */
	HCA_DETACHED = 1	/* hca in detached state */
};
211 
212 /*
213  * RPCIB per HCA structure
214  */
215 struct rib_hca_s {
216 	ibt_clnt_hdl_t		ibt_clnt_hdl;
217 
218 	/*
219 	 * per HCA.
220 	 */
221 	ibt_hca_hdl_t		hca_hdl;	/* HCA handle */
222 	ibt_hca_attr_t		hca_attrs;	/* HCA attributes */
223 	ibt_pd_hdl_t		pd_hdl;
224 	ib_guid_t		hca_guid;
225 	uint32_t		hca_nports;
226 	ibt_hca_portinfo_t	*hca_ports;
227 	size_t			hca_pinfosz;
228 	enum hca_state		state;		/* state of HCA */
229 	krwlock_t		state_lock;	/* protects state field */
230 	bool_t			inuse;		/* indicates HCA usage */
231 	kmutex_t		inuse_lock;	/* protects inuse field */
232 	/*
233 	 * List of services registered on all ports available
234 	 * on this HCA. Only one consumer of KRPC can register
235 	 * its services at one time or tear them down at one
236 	 * time.
237 	 */
238 	rib_service_t	*service_list;
239 	krwlock_t		service_list_lock;
240 
241 
242 	rib_conn_list_t		cl_conn_list;	/* client conn list */
243 	rib_conn_list_t		srv_conn_list;	/* server conn list */
244 
245 	rib_cq_t		*clnt_scq;
246 	rib_cq_t		*clnt_rcq;
247 	rib_cq_t		*svc_scq;
248 	rib_cq_t		*svc_rcq;
249 	kmutex_t		cb_lock;
250 	kcondvar_t		cb_cv;
251 
252 	rib_bufpool_t		*recv_pool;	/* recv buf pool */
253 	rib_bufpool_t		*send_pool;	/* send buf pool */
254 
255 	void			*iblock;	/* interrupt cookie */
256 
257 	kmem_cache_t	*server_side_cache;	/* long reply pool */
258 	avl_tree_t	avl_tree;
259 	kmutex_t	avl_lock;
260 	krwlock_t	avl_rw_lock;
261 	volatile bool_t avl_init;
262 	kmutex_t	cache_allocation;
263 	ddi_taskq_t *reg_cache_clean_up;
264 	ib_svc_id_t	srv_id;
265 	ibt_srv_hdl_t 	srv_hdl;
266 	uint_t		reg_state;
267 
268 };
269 
270 
271 /*
272  * Structure on wait state of a post send
273  */
274 struct send_wid {
275 	uint32_t 	xid;
276 	int		cv_sig;
277 	kmutex_t	sendwait_lock;
278 	kcondvar_t	wait_cv;
279 	uint_t		status;
280 	rib_qp_t	*qp;
281 	int		nsbufs;			/* # of send buffers posted */
282 	uint64_t	sbufaddr[DSEG_MAX];	/* posted send buffers */
283 	caddr_t		c;
284 	caddr_t		c1;
285 	int		l1;
286 	caddr_t		c2;
287 	int		l2;
288 	int		wl, rl;
289 };
290 
291 /*
292  * Structure on reply descriptor for recv queue.
293  * Different from the above posting of a descriptor.
294  */
295 struct reply {
296 	uint32_t 	xid;
297 	uint_t		status;
298 	uint64_t	vaddr_cq;	/* buf addr from CQ */
299 	uint_t		bytes_xfer;
300 	kcondvar_t	wait_cv;
301 	struct reply	*next;
302 	struct reply 	*prev;
303 };
304 
305 struct svc_recv {
306 	rib_qp_t	*qp;
307 	uint64_t	vaddr;
308 	uint_t		bytes_xfer;
309 };
310 
311 struct recv_wid {
312 	uint32_t 	xid;
313 	rib_qp_t	*qp;
314 	uint64_t	addr;	/* posted buf addr */
315 };
316 
317 /*
318  * Per QP data structure
319  */
320 struct rib_qp_s {
321 	rib_hca_t		*hca;
322 	rib_mode_t		mode;	/* RIB_SERVER or RIB_CLIENT */
323 	CONN			rdmaconn;
324 	ibt_channel_hdl_t	qp_hdl;
325 	uint_t			port_num;
326 	ib_qpn_t		qpn;
327 	int			chan_flags;
328 	clock_t			timeout;
329 	ibt_rc_chan_query_attr_t	qp_q_attrs;
330 	rib_cq_t		*send_cq;	/* send CQ */
331 	rib_cq_t		*recv_cq;	/* recv CQ */
332 
333 	/*
334 	 * Number of pre-posted rbufs
335 	 */
336 	uint_t			n_posted_rbufs;
337 	kcondvar_t 		posted_rbufs_cv;
338 	kmutex_t		posted_rbufs_lock;
339 
340 	/*
341 	 * RPC reply
342 	 */
343 	uint_t			rep_list_size;
344 	struct reply		*replylist;
345 	kmutex_t		replylist_lock;
346 
347 	/*
348 	 * server only, RDMA_DONE
349 	 */
350 	struct rdma_done_list	*rdlist;
351 	kmutex_t		rdlist_lock;
352 
353 	kmutex_t		cb_lock;
354 	kcondvar_t 		cb_conn_cv;
355 
356 	caddr_t			q;	/* upstream queue */
357 	struct send_wid		wd;
358 };
359 
/*
 * ctoqp() casts the c_private pointer of a CONN to rib_qp_t *;
 * qptoc() yields the address of a QP's rdmaconn member as a CONN *.
 */
#define	ctoqp(conn)	((rib_qp_t *)(conn)->c_private)
#define	qptoc(rqp)	((CONN *)&(rqp)->rdmaconn)
362 
/*
 * Timeouts for various calls.
 */
#define	CONN_WAIT_TIME	40
#define	SEND_WAIT_TIME	40	/* time for send completion */
#define	REPLY_WAIT_TIME	40	/* time to get reply from remote QP */
370 
371 #ifdef __cplusplus
372 }
373 #endif
374 
375 #endif	/* !_IB_H */
376