xref: /illumos-gate/usr/src/uts/common/rpc/ib.h (revision 3ce5372277f4657ad0e52d36c979527c4ca22de2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /*
25  * Copyright (c) 2007, The Ohio State University. All rights reserved.
26  *
27  * Portions of this source code is developed by the team members of
28  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
29  * headed by Professor Dhabaleswar K. (DK) Panda.
30  *
31  * Acknowledgements to contributions from developers:
32  *   Ranjit Noronha: noronha@cse.ohio-state.edu
33  *   Lei Chai      : chail@cse.ohio-state.edu
34  *   Weikuan Yu    : yuw@cse.ohio-state.edu
35  *
36  */
37 
38 
39 #ifndef _IB_H
40 #define	_IB_H
41 
42 /*
43  * ib.h, rpcib plugin interface.
44  */
45 
46 #include <sys/types.h>
47 #include <sys/ddi.h>
48 #include <sys/sunddi.h>
49 #include <sys/conf.h>
50 #include <sys/stat.h>
51 #include <rpc/rpc.h>
52 #include <rpc/rpc_rdma.h>
53 #include <sys/ib/ibtl/ibti.h>
54 #include <sys/avl.h>
55 
56 #ifdef __cplusplus
57 extern "C" {
58 #endif
59 
/*
 * Sizing defaults for the plugin's buffer pools and IB queues.
 */
#define	MAX_BUFS	1024	/* max no. of buffers per pool */

/*
 * Default CQ size.
 *
 * Tavor returns the next higher power of 2 CQ entries than the
 * requested size.  For instance, if you request (2^12 - 1) CQ
 * entries, Tavor returns 2^12 entries.  4K CQ entries suffice,
 * hence 4096 - 1.
 *
 * The replacement list is parenthesized so the macro stays a single
 * operand when used inside a larger expression; without the
 * parentheses 2 * DEF_CQ_SIZE would evaluate to 8191, not 8190.
 */
#define	DEF_CQ_SIZE	(4096 - 1)

#define	DEF_SQ_SIZE	128	/* default SendQ size */
#define	DEF_RQ_SIZE	256	/* default RecvQ size */
#define	DSEG_MAX	2	/* dimension of send_wid.sbufaddr[] */
#define	RQ_DSEG_MAX	1	/* default RQ data seg */
#define	IBSRM_HB	0x8000	/* high order bit of pkey */

/* max no. of refresh attempts on IBT_CM_CONN_STALE error */
#define	REFRESH_ATTEMPTS	3
78 
/*
 * Forward declarations.  The structure bodies appear further down;
 * these names let the structures point at one another before then.
 */
typedef struct rib_cq_s		rib_cq_t;
typedef struct rib_qp_s		rib_qp_t;
typedef struct rib_hca_s	rib_hca_t;
82 
83 /*
84  * Notification for RDMA_DONE is based on xid
85  */
86 struct rdma_done_list {
87 	uint32_t	xid;		/* XID waiting for RDMA_DONE */
88 	kcondvar_t	rdma_done_cv;	/* cv for RDMA_DONE */
89 	struct rdma_done_list	*next;
90 	struct rdma_done_list	*prev;
91 };
92 
/*
 * Plugin state values.
 *	ACCEPT		new connections and requests are accepted
 *	NO_ACCEPT	new connections and requests are not accepted
 */
#define	ACCEPT		1
#define	NO_ACCEPT	2
100 
/*
 * Send wait state.
 *
 * The negative value is parenthesized so the macro always expands to
 * a single operand, whatever operators surround it at the use site.
 */
#define	SEND_WAIT	(-1)

/*
 * Reply state.  Parenthesized for the same reason as above.
 */
#define	REPLY_WAIT	(-1)
110 
/*
 * Generic pointer type, and the sync-memory handle type defined in
 * terms of it.
 */
typedef void		*rib_pvoid;
typedef rib_pvoid	RIB_SYNCMEM_HANDLE;
113 
114 /*
115  * IB buffer pool management structure
116  */
117 
118 /*
119  * Buffer pool info
120  */
121 typedef struct {
122 	kmutex_t	buflock;	/* lock for this structure */
123 	caddr_t		buf;		/* pool address */
124 	uint32_t	bufhandle;	/* rkey for this pool */
125 	ulong_t		bufsize;	/* size of pool */
126 	int		rsize;		/* size of each element */
127 	int		numelems;	/* no. of elements allocated */
128 	int		buffree;	/* no. of free elements */
129 	void		*buflist[1];	/* free elements in pool */
130 } bufpool_t;
131 
132 typedef struct {
133 	bufpool_t	*bpool;
134 	ibt_mr_hdl_t	*mr_hdl;
135 	ibt_mr_desc_t	*mr_desc;	/* vaddr, lkey, rkey */
136 } rib_bufpool_t;
137 
/*
 * ATS related defines and structures.
 */
#define	ATS_AR_DATA_LEN		16
142 
143 
/*
 * Identifiers for the service types known to RPCIB.
 * For now only NFS is supported.
 */
#define	NFS	1
#define	NLM	2
150 
/*
 * Role of the consumer: server side or client side.
 */
typedef enum {
	RIB_SERVER,	/* consumer is a server */
	RIB_CLIENT	/* consumer is a client */
} rib_mode_t;
158 
159 /*
160  * CQ structure
161  */
162 struct rib_cq_s {
163 	rib_hca_t		*rib_hca;
164 	ibt_cq_hdl_t		rib_cq_hdl;
165 };
166 
167 /*
168  * Each registered service's data structure.
169  */
170 typedef struct rib_service_s rib_service_t;
171 struct rib_service_s {
172 	uint32_t		srv_type;	/* i.e, NFS, NLM, v4CBD */
173 	ibt_srv_hdl_t		srv_hdl;	/* from ibt_register call */
174 	ib_svc_id_t		srv_id;
175 	rib_service_t		*next;
176 };
177 
178 /*
179  * RPCIB plugin state
180  */
181 typedef struct rpcib_state {
182 	ibt_clnt_hdl_t		ibt_clnt_hdl;
183 	uint32_t		hca_count;
184 	uint32_t		nhca_inited;
185 	rib_hca_t		*hcas_list;
186 	krwlock_t		hcas_list_lock;	/* protects hcas_list */
187 	int			refcount;
188 	kmutex_t		open_hca_lock;
189 	queue_t			*q;		/* up queue for a serv_type */
190 	void			*private;
191 	rib_service_t		*service_list;
192 	krwlock_t		service_list_lock;
193 	kmutex_t		listen_lock;
194 } rpcib_state_t;
195 
196 /*
197  * Connection lists
198  */
199 typedef struct {
200 	krwlock_t	conn_lock;	/* list lock */
201 	CONN		*conn_hd;	/* list head */
202 } rib_conn_list_t;
203 
/*
 * States an HCA can be in.
 */
enum hca_state {
	HCA_DETACHED,	/* hca in detached state */
	HCA_INITED	/* hca in up and running state */
};
208 
209 typedef struct rib_hca_service_s rib_hca_service_t;
210 struct rib_hca_service_s {
211 	ib_svc_id_t	srv_id;
212 	ib_gid_t	gid;
213 	ibt_sbind_hdl_t	sbind_hdl;
214 	rib_hca_service_t *next;
215 };
216 
217 /*
218  * RPCIB per HCA structure
219  */
220 struct rib_hca_s {
221 	ibt_clnt_hdl_t		ibt_clnt_hdl;
222 
223 	/*
224 	 * per HCA.
225 	 */
226 	ibt_hca_hdl_t		hca_hdl;	/* HCA handle */
227 	ibt_hca_attr_t		hca_attrs;	/* HCA attributes */
228 	ibt_pd_hdl_t		pd_hdl;
229 	rib_hca_service_t	*bound_services;
230 	krwlock_t		bound_services_lock;
231 	ib_guid_t		hca_guid;
232 	uint32_t		hca_nports;
233 	ibt_hca_portinfo_t	*hca_ports;
234 	size_t			hca_pinfosz;
235 	enum hca_state		state;		/* state of HCA */
236 	krwlock_t		state_lock;	/* protects state field */
237 	bool_t			inuse;		/* indicates HCA usage */
238 	kmutex_t		inuse_lock;	/* protects inuse field */
239 
240 	rib_conn_list_t		cl_conn_list;	/* client conn list */
241 	rib_conn_list_t		srv_conn_list;	/* server conn list */
242 
243 	rib_cq_t		*clnt_scq;
244 	rib_cq_t		*clnt_rcq;
245 	rib_cq_t		*svc_scq;
246 	rib_cq_t		*svc_rcq;
247 	kmutex_t		cb_lock;
248 	kcondvar_t		cb_cv;
249 
250 	rib_bufpool_t		*recv_pool;	/* recv buf pool */
251 	rib_bufpool_t		*send_pool;	/* send buf pool */
252 
253 	void			*iblock;	/* interrupt cookie */
254 
255 	kmem_cache_t	*server_side_cache;	/* long reply pool */
256 	avl_tree_t	avl_tree;
257 	kmutex_t	avl_lock;
258 	krwlock_t	avl_rw_lock;
259 	volatile bool_t avl_init;
260 	kmutex_t	cache_allocation_lock;
261 	ddi_taskq_t	*cleanup_helper;
262 	ib_svc_id_t	srv_id;
263 	ibt_srv_hdl_t 	srv_hdl;
264 	uint_t		reg_state;
265 
266 	volatile uint64_t	cache_allocation;
267 	uint64_t	cache_hits;
268 	uint64_t	cache_misses;
269 	uint64_t	cache_cold_misses;
270 	uint64_t	cache_hot_misses;
271 	uint64_t	cache_misses_above_the_limit;
272 
273 	struct rib_hca_s *next;
274 };
275 
276 
277 /*
278  * Structure on wait state of a post send
279  */
280 struct send_wid {
281 	uint32_t 	xid;
282 	int		cv_sig;
283 	kmutex_t	sendwait_lock;
284 	kcondvar_t	wait_cv;
285 	uint_t		status;
286 	rib_qp_t	*qp;
287 	int		nsbufs;			/* # of send buffers posted */
288 	uint64_t	sbufaddr[DSEG_MAX];	/* posted send buffers */
289 	caddr_t		c;
290 	caddr_t		c1;
291 	int		l1;
292 	caddr_t		c2;
293 	int		l2;
294 	int		wl, rl;
295 };
296 
297 /*
298  * Structure on reply descriptor for recv queue.
299  * Different from the above posting of a descriptor.
300  */
301 struct reply {
302 	uint32_t 	xid;
303 	uint_t		status;
304 	uint64_t	vaddr_cq;	/* buf addr from CQ */
305 	uint_t		bytes_xfer;
306 	kcondvar_t	wait_cv;
307 	struct reply	*next;
308 	struct reply 	*prev;
309 };
310 
311 struct svc_recv {
312 	rib_qp_t	*qp;
313 	uint64_t	vaddr;
314 	uint_t		bytes_xfer;
315 };
316 
317 struct recv_wid {
318 	uint32_t 	xid;
319 	rib_qp_t	*qp;
320 	uint64_t	addr;	/* posted buf addr */
321 };
322 
323 /*
324  * Per QP data structure
325  */
326 struct rib_qp_s {
327 	rib_hca_t		*hca;
328 	rib_mode_t		mode;	/* RIB_SERVER or RIB_CLIENT */
329 	CONN			rdmaconn;
330 	ibt_channel_hdl_t	qp_hdl;
331 	uint_t			port_num;
332 	ib_qpn_t		qpn;
333 	int			chan_flags;
334 	clock_t			timeout;
335 	ibt_rc_chan_query_attr_t	qp_q_attrs;
336 	rib_cq_t		*send_cq;	/* send CQ */
337 	rib_cq_t		*recv_cq;	/* recv CQ */
338 
339 	/*
340 	 * Number of pre-posted rbufs
341 	 */
342 	uint_t			n_posted_rbufs;
343 	kcondvar_t 		posted_rbufs_cv;
344 	kmutex_t		posted_rbufs_lock;
345 
346 	/*
347 	 * Number of SENDs pending completion
348 	 */
349 
350 	uint_t			n_send_rbufs;
351 	kcondvar_t 		send_rbufs_cv;
352 	kmutex_t		send_rbufs_lock;
353 
354 	/*
355 	 * RPC reply
356 	 */
357 	uint_t			rep_list_size;
358 	struct reply		*replylist;
359 	kmutex_t		replylist_lock;
360 
361 	/*
362 	 * server only, RDMA_DONE
363 	 */
364 	struct rdma_done_list	*rdlist;
365 	kmutex_t		rdlist_lock;
366 
367 	kmutex_t		cb_lock;
368 	kcondvar_t 		cb_conn_cv;
369 
370 	caddr_t			q;	/* upstream queue */
371 	struct send_wid		wd;
372 };
373 
/*
 * Conversions between a CONN and a rib_qp_t.
 *	ctoqp	CONN pointer -> QP pointer, read from the CONN's
 *		c_private field
 *	qptoc	QP pointer -> address of the rdmaconn member embedded
 *		in that QP
 */
#define	ctoqp(connp)	((rib_qp_t *)((connp)->c_private))
#define	qptoc(qpp)	((CONN *)&((qpp)->rdmaconn))
376 
/*
 * Timeouts used by the various waiting calls.
 * NOTE(review): the unit of these values is not stated in this
 * header; confirm it where the constants are used.
 */
#define	CONN_WAIT_TIME		40
#define	SEND_WAIT_TIME		40	/* wait for a send to complete */
#define	REPLY_WAIT_TIME		40	/* wait for reply from remote QP */
384 
385 #ifdef __cplusplus
386 }
387 #endif
388 
389 #endif	/* !_IB_H */
390