xref: /titanic_50/usr/src/uts/common/rpc/rpc_rdma.h (revision c5c4113dfcabb1eed3d4bdf7609de5170027a794)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #ifndef	_RPC_RPC_RDMA_H
28 #define	_RPC_RPC_RDMA_H
29 
30 #pragma ident	"%Z%%M%	%I%	%E% SMI"
31 
32 #include <rpc/rpc.h>
33 #include <rpc/rpc_sztypes.h>
34 #include <sys/sunddi.h>
35 #include <sys/sunldi.h>
36 
37 #ifdef __cplusplus
38 extern "C" {
39 #endif
40 
41 #define	RPCRDMA_VERS	0	/* Version of the RPC over RDMA protocol */
42 #define	RDMATF_VERS	1	/* Version of the RDMATF API used by RPC for RDMA */
43 #define	RDMATF_VERS_1	1	/* RDMATF version 1; currently equals RDMATF_VERS */
44 
45 /*
46  * The size of an RPC call or reply message (presumably in bytes)
47  */
48 #define	RPC_MSG_SZ  1024
49 
50 /*
51  * Storage reserved for a chunk list (presumably in bytes)
52  */
53 #define	RPC_CL_SZ  1024
54 
55 /*
56  * Chunk size. NOTE(review): the name suggests a minimum -- confirm
57  */
58 #define	MINCHUNK  1024
59 
60 /*
61  * Size of a receive buffer (presumably in bytes)
62  */
63 #define	RPC_BUF_SIZE	2048
64 
65 #define	NOWAIT	0	/* don't wait for operation to complete */
66 #define	WAIT	1	/* wait and ensure that operation is complete */
67 
68 /*
69  * RDMA XDR buffer control and other control flags. Add new flags here,
70  * and set them in the private structure for XDR over RDMA in xdr_rdma.c.
71  */
72 #define	RDMA_NOCHUNK		0x1	/* only flag defined so far (bit 0) */
73 
74 /*
75  * Return codes from RDMA operations; only RDMA_SUCCESS is zero.
76  */
77 typedef enum {
78 
79 	RDMA_SUCCESS = 0,	/* successful operation */
80 
81 	RDMA_INVAL = 1,		/* invalid parameter */
82 	RDMA_TIMEDOUT = 2,	/* operation timed out */
83 	RDMA_INTR = 3,		/* operation interrupted */
84 	RDMA_NORESOURCE = 4,	/* insufficient resources */
85 	/*
86 	 * connection errors
87 	 */
88 	RDMA_REJECT = 5,	/* connection request rejected */
89 	RDMA_NOLISTENER = 6,	/* no listener on server */
90 	RDMA_UNREACHABLE = 7,	/* host unreachable */
91 	RDMA_CONNLOST = 8,	/* connection lost */
92 
93 	RDMA_XPRTFAILED = 9,	/* RDMA transport failed */
94 	RDMA_PROTECTERR = 10,	/* memory protection error */
95 	RDMA_OVERRUN = 11,	/* transport overrun */
96 	RDMA_RECVQEMPTY = 12,	/* incoming pkt dropped, recv queue empty */
97 	RDMA_PROTFAILED = 13,	/* RDMA protocol failed */
98 	RDMA_NOTSUPP = 14,	/* requested feature not supported */
99 	RDMA_REMOTERR = 15,	/* error at remote end */
100 	/*
101 	 * RDMATF errors
102 	 */
103 	RDMA_BADVERS = 16,	/* mismatched RDMATF versions */
104 	RDMA_REG_EXIST = 17,	/* RDMATF registration already exists */
105 
106 	/*
107 	 * fallback error
108 	 */
109 	RDMA_FAILED = 18	/* generic error; last and highest code */
110 } rdma_stat;
111 
112 /*
113  * Memory region context. This is an RDMA provider-generated
114  * handle for a registered, arbitrarily sized, contiguous range
115  * of virtual memory. The RDMA Interface Adapter needs this for
116  * local or remote memory access.
117  *
118  * The mrc_rmr field holds the remote memory region context,
119  * which is sent over-the-wire to provide the remote host
120  * with RDMA access to the memory region.
121  */
122 struct mrc {
123 	uint32_t	mrc_rmr;	/* Remote MR context, sent OTW */
124 	union {				/* local handle; mr is its only member */
125 		struct mr {
126 			uint32_t	lmr; 	/* Local MR context */
127 			uint64_t	linfo;	/* Local memory info */
128 		} mr;
129 	} lhdl;
130 };
131 
132 #define	mrc_lmr		lhdl.mr.lmr	/* shorthand for the local MR context */
133 #define	mrc_linfo	lhdl.mr.linfo	/* shorthand for the local memory info */
134 
135 /*
136  * The XDR offset value is used by the XDR
137  * routine to identify the position in the
138  * RPC message where the opaque object would
139  * normally occur. Neither the data content
140  * of the chunk, nor its size field is included
141  * in the RPC message.  The XDR offset is calculated
142  * as if the chunks were present.
143  *
144  * The remaining fields identify the chunk of data itself. The
145  * c_smemhandle and c_dmemhandle fields identify the registered RDMA
146  * memory regions of the source and destination; c_saddr, c_daddr
147  * and c_len identify the chunk within them.
148  */
149 struct clist {
150 	uint32		c_xdroff;	/* XDR offset */
151 	uint32		c_len;		/* Length */
152 	struct mrc	c_smemhandle;	/* src memory handle */
153 	uint64 		c_ssynchandle;	/* src sync handle */
154 	uint64		c_saddr;	/* src address */
155 	struct mrc	c_dmemhandle;	/* dst memory handle */
156 	uint64		c_dsynchandle;	/* dst sync handle */
157 	uint64		c_daddr;	/* dst address */
158 	struct clist	*c_next;	/* Next chunk in this singly linked list */
159 };
160 
161 typedef struct clist clist;	/* allows "clist" without the struct keyword */
162 
163 enum rdma_proc {	/* tells what follows in a message; values are fixed 0-3 */
164 	RDMA_MSG	= 0,	/* chunk list and RPC msg follow */
165 	RDMA_NOMSG	= 1,	/* only chunk list follows */
166 	RDMA_MSGP	= 2,	/* chunk list and RPC msg with padding follow */
167 	RDMA_DONE	= 3	/* signal completion of chunk transfer */
168 };
169 
170 /*
171  * Listener information for a service; this is the argument type of
172  * the rdma_svc_listen and rdma_svc_stop operations in rdmaops_t. */
173 struct rdma_svc_data {
174 	queue_t		q;	/* queue_t to place incoming pkts */
175 	int		active;	/* If active, after registration startup */
176 	rdma_stat	err_code;	/* Error code from plugin layer */
177 	int32_t		svcid;		/* RDMA based service identifier */
178 };
179 
180 /*
181  * Per RDMA plugin module information.
182  * Populated by each plugin module during its initialization;
183  * this is the type taken by rdma_register_mod()/rdma_unregister_mod().
184  */
185 typedef struct rdma_mod {
186 	char 		*rdma_api;		/* "kvipl", "ibtf", etc */
187 	uint_t 		rdma_version;		/* RDMATF API version */
188 	int		rdma_count;		/* # of devices */
189 	struct rdmaops 	*rdma_ops;		/* rdma op vector for api */
190 } rdma_mod_t;
191 
192 /*
193  * Registry of RDMA plugins: one node of a singly linked list
194  */
195 typedef struct rdma_registry {
196 	rdma_mod_t	*r_mod;		/* plugin mod info */
197 	struct rdma_registry *r_next;	/* next registered RDMA plugin */
198 } rdma_registry_t;
199 
200 /*
201  * RDMA transport information, filled in through the rdma_getinfo operation
202  */
203 typedef struct rdma_info {
204 	uint_t	addrlen;	/* address length */
205 	uint_t  mts;		/* max transfer size */
206 	uint_t  mtu;		/* native mtu size of underlying network */
207 } rdma_info_t;
208 
209 /*
210  * RDMA Connection information
211  */
212 typedef struct conn {
213 	rdma_mod_t	*c_rdmamod;	/* plugin module whose rdma_ops serve conn */
214 	struct netbuf	c_raddr;	/* remote address */
215 	struct netbuf	c_laddr;	/* local address */
216 	int		c_ref;		/* no. of clients of connection */
217 	struct conn	*c_next;	/* next in list of connections */
218 	struct conn	*c_prev;	/* prev in list of connections */
219 	caddr_t		c_private;	/* transport specific stuff */
220 	/* NOTE(review): these look like the values kept in c_state -- confirm */
221 #define	C_IDLE		0x80000000
222 #define	C_CONN_PEND	0x40000000
223 #define	C_CONNECTED	0x20000000
224 #define	C_ERROR		0x10000000
225 #define	C_DISCONN_PEND	0x08000000
226 #define	C_REMOTE_DOWN	0x04000000
227 
228 	uint_t		c_state;	/* state of connection */
229 	kmutex_t	c_lock;		/* protects the c_state and c_ref fields */
230 	kcondvar_t	c_cv;		/* to signal when pending is done */
231 } CONN;
232 
233 
234 /*
235  * Memory management for the RDMA buffers
236  */
237 /*
238  * RDMA buffer types; values are implicit, starting at 0 for SEND_BUFFER
239  */
240 typedef enum {
241 	SEND_BUFFER,	/* buf for send msg */
242 	SEND_DESCRIPTOR, /* buf used for send msg descriptor in plugins only */
243 	RECV_BUFFER,	/* buf for recv msg */
244 	RECV_DESCRIPTOR, /* buf used for recv msg descriptor in plugins only */
245 	CHUNK_BUFFER	/* chunk buf used in RDMATF only and not in plugins */
246 } rdma_btype;
247 
248 /*
249  * RDMA buffer information; taken by the rdma_buf_alloc/rdma_buf_free ops
250  */
251 typedef struct rdma_buf {
252 	rdma_btype	type;	/* buffer type */
253 	int		len;	/* length of buffer */
254 	caddr_t		addr;	/* buffer address */
255 	struct mrc	handle;	/* buffer registration handle */
256 } rdma_buf_t;
257 
258 /*
259  * Data transferred from plugin interrupt to svc_queuereq()
260  */
261 struct recv_data {
262 	CONN		*conn;	/* presumably the conn the msg arrived on */
263 	int		status;	/* NOTE(review): likely an rdma_stat -- confirm */
264 	rdma_buf_t	rpcmsg;	/* buffer for the RPC message (per the name) */
265 };
266 
267 /*
268  * Operations vector for RDMA transports (see rdma_mod_t.rdma_ops).
269  */
270 typedef struct rdmaops {
271 	/* Network */
272 	rdma_stat	(*rdma_reachable)(int addr_type, struct netbuf *,
273 						void **handle);
274 	/* Connection */
275 	rdma_stat	(*rdma_get_conn)(struct netbuf *, int addr_type,
276 						void *, CONN **);
277 	rdma_stat	(*rdma_rel_conn)(CONN *);
278 	/* Server side listener start and stop routines */
279 	void		(*rdma_svc_listen)(struct rdma_svc_data *);
280 	void		(*rdma_svc_stop)(struct rdma_svc_data *);
281 	/* Memory; note the deregister ops take struct mrc by value */
282 	rdma_stat	(*rdma_regmem)(CONN *, caddr_t, uint_t, struct mrc *);
283 	rdma_stat	(*rdma_deregmem)(CONN *, caddr_t, struct mrc);
284 	rdma_stat	(*rdma_regmemsync)(CONN *, caddr_t, uint_t,
285 				struct mrc *, void **);
286 	rdma_stat	(*rdma_deregmemsync)(CONN *, caddr_t, struct mrc,
287 				void *);
288 	rdma_stat	(*rdma_syncmem)(CONN *, void *, caddr_t, int, int);
289 	/* Buffer */
290 	rdma_stat	(*rdma_buf_alloc)(CONN *, rdma_buf_t *);
291 	void		(*rdma_buf_free)(CONN *, rdma_buf_t *);
292 	/* Transfer */
293 	rdma_stat	(*rdma_send)(CONN *, clist *, uint32_t);
294 	rdma_stat	(*rdma_send_resp)(CONN *, clist *, uint32_t);
295 	rdma_stat	(*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t);
296 	rdma_stat	(*rdma_svc_recvbuf)(CONN *, clist *);
297 	rdma_stat	(*rdma_recv)(CONN *, clist **, uint32_t);
298 	/* RDMA */
299 	rdma_stat	(*rdma_read)(CONN *, clist *, int);
300 	rdma_stat	(*rdma_write)(CONN *, clist *, int);
301 	/* INFO */
302 	rdma_stat	(*rdma_getinfo)(rdma_info_t *info);
303 
304 } rdmaops_t;
305 
306 /*
307  * RDMA operations: wrappers that call through an rdmaops_t vector.
308  */
309 #define	RDMA_REACHABLE(rdma_ops, addr_type, addr, handle)	\
310 	(*(rdma_ops)->rdma_reachable)(addr_type, addr, handle)
311 
312 #define	RDMA_GET_CONN(rdma_ops, addr, addr_type, handle, conn)	\
313 	(*(rdma_ops)->rdma_get_conn)(addr, addr_type, handle, conn)
314 /* From here through RDMA_WRITE, conn is evaluated twice per use. */
315 #define	RDMA_REL_CONN(conn)	\
316 	(*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn)
317 
318 #define	RDMA_REGMEM(conn, buff, len, handle)	\
319 	(*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, buff, len, handle)
320 
321 #define	RDMA_DEREGMEM(conn, buff, handle)	\
322 	(*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle)
323 
324 #define	RDMA_REGMEMSYNC(conn, buff, len, handle, synchandle)	\
325 	(*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, buff, \
326 	    len, handle, synchandle)
327 
328 #define	RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle)	\
329 	(*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \
330 	    handle, synchandle)
331 
332 #define	RDMA_SYNCMEM(conn, handle, buff, len, direction)	\
333 	(*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \
334 	    buff, len, direction)
335 
336 #define	RDMA_BUF_ALLOC(conn, rbuf)	\
337 	(*(conn)->c_rdmamod->rdma_ops->rdma_buf_alloc)(conn, rbuf)
338 
339 #define	RDMA_BUF_FREE(conn, rbuf)	\
340 	(*(conn)->c_rdmamod->rdma_ops->rdma_buf_free)(conn, rbuf)
341 
342 #define	RDMA_SEND(conn, sendlist, xid)	\
343 	(*(conn)->c_rdmamod->rdma_ops->rdma_send)(conn, sendlist, xid)
344 
345 #define	RDMA_SEND_RESP(conn, sendlist, xid)	\
346 	(*(conn)->c_rdmamod->rdma_ops->rdma_send_resp)(conn, sendlist, xid)
347 
348 #define	RDMA_CLNT_RECVBUF(conn, cl, xid)	\
349 	(*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf)(conn, cl, xid)
350 
351 #define	RDMA_SVC_RECVBUF(conn, cl)	\
352 	(*(conn)->c_rdmamod->rdma_ops->rdma_svc_recvbuf)(conn, cl)
353 
354 #define	RDMA_RECV(conn, recvlist, xid)	\
355 	(*(conn)->c_rdmamod->rdma_ops->rdma_recv)(conn, recvlist, xid)
356 
357 #define	RDMA_READ(conn, cl, wait)	\
358 	(*(conn)->c_rdmamod->rdma_ops->rdma_read)(conn, cl, wait)
359 
360 #define	RDMA_WRITE(conn, cl, wait)	\
361 	(*(conn)->c_rdmamod->rdma_ops->rdma_write)(conn, cl, wait)
362 /* RDMA_GETINFO takes the rdma_mod_t pointer itself, not a connection. */
363 #define	RDMA_GETINFO(rdma_mod, info)	\
364 	(*(rdma_mod)->rdma_ops->rdma_getinfo)(info)
365 
366 #ifdef _KERNEL
367 extern rdma_registry_t	*rdma_mod_head;	/* head of the plugin registry list */
368 extern krwlock_t rdma_lock;		/* protects rdma_mod_head list */
369 extern int rdma_modloaded;		/* flag for loading RDMA plugins */
370 extern int rdma_dev_available;		/* rdma device is loaded or not */
371 extern kmutex_t rdma_modload_lock;	/* protects rdma_modloaded flag */
372 extern uint_t rdma_minchunk;		/* NOTE(review): cf. MINCHUNK -- confirm */
373 extern ldi_ident_t rpcmod_li; 		/* needed by layered driver framework */
374 
375 /*
376  * General RDMA routines. rdma_modload is declared with (void), not (),
377  * so that a call passing arguments is diagnosed by the compiler. */
378 extern void clist_add(struct clist **clp, uint32_t xdroff, int len,
379 	struct mrc *shandle, caddr_t saddr,
380 	struct mrc *dhandle, caddr_t daddr);
381 extern void clist_free(struct clist *cl);
382 extern int clist_register(CONN *conn, struct clist *cl, bool_t src);
383 extern int clist_deregister(CONN *conn, struct clist *cl, bool_t src);
384 extern rdma_stat rdma_clnt_postrecv(CONN *conn, uint32_t xid);
385 extern rdma_stat rdma_svc_postrecv(CONN *conn);
386 extern rdma_stat clist_syncmem(CONN *conn, struct clist *cl, bool_t src);
387 extern rdma_stat rdma_register_mod(rdma_mod_t *mod);
388 extern rdma_stat rdma_unregister_mod(rdma_mod_t *mod);
389 extern void rdma_buf_free(CONN *conn, rdma_buf_t *rbuf);
390 extern int rdma_modload(void);
391 
392 /*
393  * RDMA XDR: routines for XDR streams over RDMA and their chunk lists
394  */
395 extern void xdrrdma_create(XDR *, caddr_t, uint_t, int, struct clist *,
396 	enum xdr_op, CONN *);
397 extern void xdrrdma_destroy(XDR *);
398 extern struct clist *xdrrdma_clist(XDR *);
399 extern uint_t xdrrdma_getpos(XDR *);
400 extern bool_t xdrrdma_setpos(XDR *, uint_t);
401 extern bool_t xdr_clist(XDR *, clist *);
402 extern bool_t xdr_do_clist(XDR *, clist **);
403 extern uint_t xdr_getbufsize(XDR *);
404 unsigned int xdrrdma_sizeof(xdrproc_t func, void *data, int min_chunk);
405 unsigned int xdrrdma_authsize(AUTH *auth, struct cred *cred, int min_chunk);
406 #endif /* _KERNEL */
407 
408 #ifdef __cplusplus
409 }
410 #endif
411 
412 #endif	/* _RPC_RPC_RDMA_H */
413