xref: /illumos-gate/usr/src/uts/common/rpc/rpc_rdma.h (revision 2654012f83cec5dc15b61dfe3e4a4915f186e7a6)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /*
27  * Copyright (c) 2007, The Ohio State University. All rights reserved.
28  *
29  * Portions of this source code is developed by the team members of
30  * The Ohio State University's Network-Based Computing Laboratory (NBCL),
31  * headed by Professor Dhabaleswar K. (DK) Panda.
32  *
33  * Acknowledgements to contributions from developers:
34  *   Ranjit Noronha: noronha@cse.ohio-state.edu
35  *   Lei Chai      : chail@cse.ohio-state.edu
36  *   Weikuan Yu    : yuw@cse.ohio-state.edu
37  *
38  */
39 
40 #ifndef	_RPC_RPC_RDMA_H
41 #define	_RPC_RPC_RDMA_H
42 
43 #include <rpc/rpc.h>
44 #include <rpc/rpc_sztypes.h>
45 #include <sys/sunddi.h>
46 #include <sys/sunldi.h>
47 
48 #ifdef __cplusplus
49 extern "C" {
50 #endif
51 
52 #define	RPCRDMA_VERS	1	/* Version of the RPC over RDMA protocol */
53 #define	RDMATF_VERS	1	/* Version of the API used by RPC for RDMA */
54 #define	RDMATF_VERS_1	1	/* Current version of RDMATF */
55 
56 /*
57  * The size of an RPC call or reply message
58  */
59 #define	RPC_MSG_SZ	1024
60 
61 /*
62  * RDMA chunk size
63  */
64 #define	RDMA_MINCHUNK	1024
65 
66 /*
67  * Storage for a chunk list
68  */
69 #define	RPC_CL_SZ  1024
70 
71 /*
72  * Chunk size (same value as RDMA_MINCHUNK above)
73  */
74 #define	MINCHUNK  1024
75 
76 /*
77  * Size of receive buffer
78  */
79 #define	RPC_BUF_SIZE	2048
80 
81 #define	NOWAIT	0	/* don't wait for operation to complete */
82 #define	WAIT	1	/* wait and ensure that operation is complete */
83 
84 /*
85  * RDMA xdr buffer control and other control flags. Add new flags here,
86  * set them in private structure for xdr over RDMA in xdr_rdma.c
87  * Each flag below is a distinct single bit.
88  */
89 #define	XDR_RDMA_CHUNK			0x1
90 #define	XDR_RDMA_WLIST_REG		0x2
91 #define	XDR_RDMA_RLIST_REG		0x4
92 
93 #define	LONG_REPLY_LEN	65536	/* 64 * 1024 */
94 #define	WCL_BUF_LEN	32768	/* 32 * 1024 */
95 #define	RCL_BUF_LEN	32768	/* 32 * 1024 */
96 
97 
98 #define	RDMA_BUFS_RQST	34	/* Num bufs requested by client */
99 #define	RDMA_BUFS_GRANT	32	/* Num bufs granted by server */
99 
100 struct xdr_ops *xdrrdma_xops(void);	/* no arguments; NOTE(review): presumably returns the XDR-over-RDMA ops vector -- confirm in xdr_rdma.c */
101 
102 /*
103  * Credit Control Structures. rdma_cc_type_t is the type of CONN's c_cc_type field.
104  */
105 typedef enum rdma_cc_type {
106 	RDMA_CC_CLNT,	/* CONN is for a client */
107 	RDMA_CC_SRV	/* CONN is for a server */
108 } rdma_cc_type_t;
109 
110 /*
111  * Client side credit control data structure (member c_clnt_cc of the CONN union).
112  */
113 typedef struct rdma_clnt_cred_ctrl {
114 	uint32_t	clnt_cc_granted_ops;	/* NOTE(review): presumably ops granted to this client -- confirm */
115 	uint32_t	clnt_cc_in_flight_ops;	/* NOTE(review): presumably ops currently outstanding -- confirm */
116 	kcondvar_t	clnt_cc_cv;	/* NOTE(review): presumably waited on until credit is available -- confirm */
117 } rdma_clnt_cred_ctrl_t;
118 
119 /*
120  * Server side credit control data structure (member c_srv_cc of the CONN union).
121  */
122 typedef struct rdma_srv_cred_ctrl {
123 	uint32_t	srv_cc_buffers_granted;	/* NOTE(review): presumably buffers granted to the peer -- confirm */
124 	uint32_t	srv_cc_cur_buffers_used;	/* NOTE(review): presumably buffers now in use -- confirm */
125 	uint32_t	srv_cc_posted;	/* NOTE(review): presumably count of posted buffers -- confirm */
126 	uint32_t	srv_cc_max_buf_size;	/* to be determined by CCP */
127 	uint32_t	srv_cc_cur_buf_size;	/* to be determined by CCP */
128 } rdma_srv_cred_ctrl_t;
129 
130 typedef enum {	/* NOTE(review): meanings below are inferred from the names only -- confirm against users */
131     RPCCALL_WLIST,	/* presumably: call carries a write list */
132     RPCCALL_WCHUNK,	/* presumably: call carries a (reply) write chunk */
133     RPCCALL_NOWRITE	/* presumably: no write list or chunk */
134 }rpccall_write_t;
135 
136 typedef enum {	/* last argument of clist_register(), clist_deregister() and clist_syncmem() */
137 	CLIST_REG_SOURCE,	/* NOTE(review): presumably operate on the src fields of each clist -- confirm */
138 	CLIST_REG_DST	/* NOTE(review): presumably operate on the dst fields of each clist -- confirm */
139 } clist_dstsrc;
140 
141 /*
142  * Return codes from RDMA operations (return type of most rdmaops_t entries)
143  */
144 typedef enum {
145 
146 	RDMA_SUCCESS = 0,	/* successful operation */
147 
148 	RDMA_INVAL = 1,		/* invalid parameter */
149 	RDMA_TIMEDOUT = 2,	/* operation timed out */
150 	RDMA_INTR = 3,		/* operation interrupted */
151 	RDMA_NORESOURCE = 4,	/* insufficient resource */
152 	/*
153 	 * connection errors
154 	 */
155 	RDMA_REJECT = 5,	/* connection req rejected */
156 	RDMA_NOLISTENER = 6,	/* no listener on server */
157 	RDMA_UNREACHABLE = 7,	/* host unreachable */
158 	RDMA_CONNLOST = 8,	/* connection lost */
159 
160 	RDMA_XPRTFAILED = 9,	/* RDMA transport failed */
161 	RDMA_PROTECTERR = 10,	/* memory protection error */
162 	RDMA_OVERRUN = 11,	/* transport overrun */
163 	RDMA_RECVQEMPTY = 12,	/* incoming pkt dropped, recv q empty */
164 	RDMA_PROTFAILED = 13,	/* RDMA protocol failed */
165 	RDMA_NOTSUPP = 14,	/* requested feature not supported */
166 	RDMA_REMOTERR = 15,	/* error at remote end */
167 	/*
168 	 * RDMATF errors
169 	 */
170 	RDMA_BADVERS = 16,	/* mismatch RDMATF versions */
171 	RDMA_REG_EXIST = 17,	/* RDMATF registration already exists */
172 
173 	/*
174 	 * fallback error
175 	 */
176 	RDMA_FAILED = 18	/* generic error */
177 } rdma_stat;
178 
179 /*
180  * Memory region context. This is an RDMA provider generated
181  * handle for a registered arbitrary size contiguous virtual
182  * memory. The RDMA Interface Adapter needs this for local or
183  * remote memory access.
184  *
185  * The mrc_rmr field holds the remote memory region context
186  * which is sent over-the-wire to provide the remote host
187  * with RDMA access to the memory region.
188  */
189 struct mrc {
190 	uint32_t	mrc_rmr;	/* Remote MR context, sent OTW */
191 	union {	/* union with a single member, struct mr */
192 		struct mr {
193 			uint32_t	lmr; 	/* Local MR context */
194 			uint64_t	linfo;	/* Local memory info */
195 		} mr;
196 	} lhdl;
197 };
198 
199 #define	mrc_lmr		lhdl.mr.lmr	/* shorthand for the nested local MR context */
200 #define	mrc_linfo	lhdl.mr.linfo	/* shorthand for the nested local memory info */
201 
202 /*
203  * Memory management for the RDMA buffers
204  */
205 /*
206  * RDMA buffer types
207  */
208 typedef enum {
209 	SEND_BUFFER,	/* buf for send msg */
210 	SEND_DESCRIPTOR, /* buf used for send msg descriptor in plugins only */
211 	RECV_BUFFER,	/* buf for recv msg */
212 	RECV_DESCRIPTOR, /* buf used for recv msg descriptor in plugins only */
213 	RDMA_LONG_BUFFER /* chunk buf used in RDMATF only and not in plugins */
214 } rdma_btype;	/* type of the rdma_buf_t "type" field */
215 
216 /*
217  * RDMA buffer information (argument of rdma_buf_alloc() and rdma_buf_free())
218  */
219 typedef struct rdma_buf {
220 	rdma_btype	type;	/* buffer type */
221 	uint_t		len;	/* length of buffer */
222 	caddr_t		addr;	/* buffer address */
223 	struct mrc	handle;	/* buffer registration handle */
224 	caddr_t		rb_private;	/* NOTE(review): opaque pointer; its owner is not visible in this header */
225 } rdma_buf_t;
226 
227 
228 /*
229  * The XDR offset value is used by the XDR
230  * routine to identify the position in the
231  * RPC message where the opaque object would
232  * normally occur. Neither the data content
233  * of the chunk, nor its size field are included
234  * in the RPC message.  The XDR offset is calculated
235  * as if the chunks were present.
236  *
237  * The remaining fields identify the chunk of data
238  * on the sender.  The c_smemhandle identifies a
239  * registered RDMA memory region and the c_saddr
240  * and c_len fields identify the chunk within it.
241  */
242 struct clist {
243 	uint32		c_xdroff;	/* XDR offset */
244 	uint32		c_len;		/* Length */
245 	struct mrc	c_smemhandle;	/* src memory handle */
246 	uint64 		c_ssynchandle;	/* src sync handle */
247 	union {
248 		uint64		c_saddr;	/* src address */
249 		caddr_t 	c_saddr3;	/* pointer view sharing storage with c_saddr */
250 	} w;
251 	struct mrc	c_dmemhandle;	/* dst memory handle */
252 	uint64		c_dsynchandle;	/* dst sync handle */
253 	union {
254 		uint64	c_daddr;	/* dst address */
255 		caddr_t	c_daddr3;	/* pointer view sharing storage with c_daddr */
256 	} u;
257 	struct as	*c_adspc;	/* address space for saddr/daddr */
258 	rdma_buf_t	rb_longbuf;	/* used for long requests/replies */
259 	struct clist	*c_next;	/* Next chunk */
260 };
261 
262 typedef struct clist clist;
263 
264 /*
265  * max 4M wlist xfer size
266  * This is defined because the rfs3_tsize service requires a
267  * svc_req struct (which we don't have in krecv).
268  */
269 #define	MAX_SVC_XFER_SIZE (4*1024*1024)
270 
271 enum rdma_proc {	/* NOTE(review): presumably the RPC-over-RDMA message types (cf. RFC 5666) -- confirm */
272 	RDMA_MSG	= 0,	/* chunk list and RPC msg follow */
273 	RDMA_NOMSG	= 1,	/* only chunk list follows */
274 	RDMA_MSGP	= 2,	/* chunk list and RPC msg with padding follow */
275 	RDMA_DONE	= 3	/* signal completion of chunk transfer */
276 };
277 
278 /*
279  * Listener information for a service (argument of rdma_svc_listen/rdma_svc_stop)
280  */
281 struct rdma_svc_data {
282 	queue_t		q;	/* queue_t to place incoming pkts */
283 	int		active;	/* If active, after registration startup */
284 	rdma_stat	err_code;	/* Error code from plugin layer */
285 	int32_t		svcid;		/* RDMA based service identifier */
286 };
287 
288 /*
289  * Per RDMA plugin module information.
290  * Will be populated by each plugin
291  * module during its initialization.
292  */
293 typedef struct rdma_mod {	/* argument of rdma_register_mod() and rdma_unregister_mod() */
294 	char 		*rdma_api;		/* "kvipl", "ibtf", etc */
295 	uint_t 		rdma_version;		/* RDMATF API version */
296 	int		rdma_count;		/* # of devices */
297 	struct rdmaops 	*rdma_ops;		/* rdma op vector for api */
298 } rdma_mod_t;
299 
300 /*
301  * Registry of RDMA plugins: singly linked list; rdma_mod_head is its head pointer
302  */
303 typedef struct rdma_registry {
304 	rdma_mod_t	*r_mod;		/* plugin mod info */
305 	struct rdma_registry *r_next;	/* next registered RDMA plugin */
306 } rdma_registry_t;
307 
308 /*
309  * RDMA transport information (filled in through the rdma_getinfo op)
310  */
311 typedef struct rdma_info {
312 	uint_t	addrlen;	/* address length */
313 	uint_t  mts;		/* max transfer size */
314 	uint_t  mtu;		/* native mtu size of underlying network */
315 } rdma_info_t;
316 
317 typedef enum {	/* type of CONN's c_state field; each value is a distinct single bit */
318 	C_IDLE		= 0x00000001,
319 	C_CONN_PEND	= 0x00000002,
320 	C_CONNECTED	= 0x00000004,
321 	C_ERROR_CONN	= 0x00000008,
322 	C_DISCONN_PEND	= 0x00000010,
323 	C_REMOTE_DOWN	= 0x00000020
324 } conn_c_state;
325 
326 /*
327  * RDMA Connection information
328  */
329 typedef struct conn {
330 	rdma_mod_t	*c_rdmamod;	/* RDMA transport info for conn */
331 	struct netbuf	c_raddr;	/* remote address */
332 	struct netbuf	c_laddr;	/* local address */
333 	int		c_ref;		/* no. of clients of connection */
334 	struct conn	*c_next;	/* next in list of connections */
335 	struct conn	*c_prev;	/* prev in list of connections */
336 	caddr_t		c_private;	/* transport specific stuff */
337 	conn_c_state	c_state;	/* state of connection */
338 	rdma_cc_type_t	c_cc_type;	/* client or server, for credit cntrl */
339 	union {	/* credit control state; NOTE(review): presumably selected by c_cc_type -- confirm */
340 		rdma_clnt_cred_ctrl_t	c_clnt_cc;
341 		rdma_srv_cred_ctrl_t	c_srv_cc;
342 	} rdma_conn_cred_ctrl_u;
343 	kmutex_t	c_lock;		/* protect c_state and c_ref fields */
344 	kcondvar_t	c_cv;		/* to signal when pending is done */
345 } CONN;
346 
347 
348 /*
349  * Data transferred from plugin interrupt to svc_queuereq()
350  */
351 typedef struct rdma_recv_data {
352 	CONN		*conn;	/* NOTE(review): presumably the connection the message arrived on -- confirm */
353 	int		status;	/* NOTE(review): value set is not visible in this header -- confirm */
354 	rdma_buf_t	rpcmsg;	/* NOTE(review): presumably the buffer holding the received RPC message -- confirm */
355 } rdma_recv_data_t;
356 
357 /* structure used to pass information for READ over rdma write */
358 typedef enum {	/* type of the rci_type field in rdma_chunkinfo_t */
359 	RCI_WRITE_UIO_CHUNK = 1,	/* NOTE(review): presumably rci_a.rci_uiop is the valid member -- confirm */
360 	RCI_WRITE_ADDR_CHUNK = 2,	/* NOTE(review): presumably rci_a.rci_addr is the valid member -- confirm */
361 	RCI_REPLY_CHUNK = 3
362 } rci_type_t;
363 
364 typedef struct {
365 	rci_type_t rci_type;	/* NOTE(review): presumably selects the rci_a member -- confirm */
366 	union {
367 		struct uio *rci_uiop;
368 		caddr_t    rci_addr;
369 	} rci_a;
370 	uint32	rci_len;	/* a length; NOTE(review): what it measures is not visible in this header */
371 	struct clist	**rci_clpp; /* point to write chunk list in readargs */
372 } rdma_chunkinfo_t;
373 
374 typedef struct {	/* a pair of lengths; NOTE(review): how the two differ is not visible in this header */
375 	uint_t rcil_len;
376 	uint_t rcil_len_alt;
377 } rdma_chunkinfo_lengths_t;
378 
379 typedef struct {	/* a chunk list pointer and a CONN pointer; NOTE(review): presumably a write list and its connection */
380 	struct	clist	*rwci_wlist;
381 	CONN		*rwci_conn;
382 } rdma_wlist_conn_info_t;
383 
384 /*
385  * Operations vector for RDMA transports (pointed to by rdma_mod_t's rdma_ops).
386  */
387 typedef struct rdmaops {
388 	/* Network */
389 	rdma_stat	(*rdma_reachable)(int addr_type, struct netbuf *,
390 						void **handle);
391 	/* Connection */
392 	rdma_stat	(*rdma_get_conn)(struct netbuf *, int addr_type,
393 						void *, CONN **);
394 	rdma_stat	(*rdma_rel_conn)(CONN *);
395 	/* Server side listener start and stop routines */
396 	void		(*rdma_svc_listen)(struct rdma_svc_data *);
397 	void		(*rdma_svc_stop)(struct rdma_svc_data *);
398 	/* Memory */
399 	rdma_stat	(*rdma_regmem)(CONN *, caddr_t, caddr_t,
400 			    uint_t, struct mrc *);
401 	rdma_stat	(*rdma_deregmem)(CONN *, caddr_t, struct mrc);
402 	rdma_stat	(*rdma_regmemsync)(CONN *, caddr_t, caddr_t, uint_t,
403 				struct mrc *, void **, void *);
404 	rdma_stat	(*rdma_deregmemsync)(CONN *, caddr_t, struct mrc,
405 			    void *, void *);
406 	rdma_stat	(*rdma_syncmem)(CONN *, void *, caddr_t, int, int);
407 	/* Buffer */
408 	rdma_stat	(*rdma_buf_alloc)(CONN *, rdma_buf_t *);
409 	void		(*rdma_buf_free)(CONN *, rdma_buf_t *);
410 	/* Transfer */
411 	rdma_stat	(*rdma_send)(CONN *, clist *, uint32_t);
412 	rdma_stat	(*rdma_send_resp)(CONN *, clist *, uint32_t);
413 	rdma_stat	(*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t);
414 	rdma_stat	(*rdma_clnt_recvbuf_remove)(CONN *, uint32_t);
415 	rdma_stat	(*rdma_svc_recvbuf)(CONN *, clist *);
416 	rdma_stat	(*rdma_recv)(CONN *, clist **, uint32_t);
417 	/* RDMA */
418 	rdma_stat	(*rdma_read)(CONN *, clist *, int);
419 	rdma_stat	(*rdma_write)(CONN *, clist *, int);
420 	/* INFO */
421 	rdma_stat	(*rdma_getinfo)(rdma_info_t *info);
422 } rdmaops_t;
423 
424 /*
425  * RDMA operations (rdmaops_t dispatch; a leading conn parameter is expanded twice)
426  */
427 #define	RDMA_REACHABLE(rdma_ops, addr_type, addr, handle)	\
428 	(*(rdma_ops)->rdma_reachable)(addr_type, addr, handle)
429 
430 #define	RDMA_GET_CONN(rdma_ops, addr, addr_type, handle, conn)	\
431 	(*(rdma_ops)->rdma_get_conn)(addr, addr_type, handle, conn)
432 
433 #define	RDMA_REL_CONN(conn)	\
434 	(*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn)
435 
436 #define	RDMA_REGMEM(conn, adsp, buff, len, handle)	\
437 	(*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, adsp,	\
438 		buff, len, handle)
439 
440 #define	RDMA_DEREGMEM(conn, buff, handle)	\
441 	(*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle)
442 
443 #define	RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc)	\
444 	(*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \
445 	len, handle, synchandle, lrc)
446 
447 #define	RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc)	\
448 	(*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff,	\
449 	handle, synchandle, lrc)
450 
451 #define	RDMA_SYNCMEM(conn, handle, buff, len, direction)	\
452 	(*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \
453 	    buff, len, direction)
454 
455 #define	RDMA_BUF_ALLOC(conn, rbuf)	\
456 	(*(conn)->c_rdmamod->rdma_ops->rdma_buf_alloc)(conn, rbuf)
457 
458 #define	RDMA_BUF_FREE(conn, rbuf)	\
459 	(*(conn)->c_rdmamod->rdma_ops->rdma_buf_free)(conn, rbuf)
460 
461 #define	RDMA_SEND(conn, sendlist, xid)	\
462 	(*(conn)->c_rdmamod->rdma_ops->rdma_send)(conn, sendlist, xid)
463 
464 #define	RDMA_SEND_RESP(conn, sendlist, xid)	\
465 	(*(conn)->c_rdmamod->rdma_ops->rdma_send_resp)(conn, sendlist, xid)
466 
467 #define	RDMA_CLNT_RECVBUF(conn, cl, xid)	\
468 	(*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf)(conn, cl, xid)
469 
470 #define	RDMA_CLNT_RECVBUF_REMOVE(conn, xid)	\
471 	(*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf_remove)(conn, xid)
472 
473 #define	RDMA_SVC_RECVBUF(conn, cl)	\
474 	(*(conn)->c_rdmamod->rdma_ops->rdma_svc_recvbuf)(conn, cl)
475 
476 #define	RDMA_RECV(conn, recvlist, xid)	\
477 	(*(conn)->c_rdmamod->rdma_ops->rdma_recv)(conn, recvlist, xid)
478 
479 #define	RDMA_READ(conn, cl, wait)	\
480 	(*(conn)->c_rdmamod->rdma_ops->rdma_read)(conn, cl, wait)
481 
482 #define	RDMA_WRITE(conn, cl, wait)	\
483 	(*(conn)->c_rdmamod->rdma_ops->rdma_write)(conn, cl, wait)
484 
485 #define	RDMA_GETINFO(rdma_mod, info)	\
486 	(*(rdma_mod)->rdma_ops->rdma_getinfo)(info)
487 
488 #ifdef _KERNEL
489 extern rdma_registry_t	*rdma_mod_head;	/* head of the rdma_registry_t list */
490 extern krwlock_t rdma_lock;		/* protects rdma_mod_head list */
491 extern int rdma_modloaded;		/* flag for loading RDMA plugins */
492 extern int rdma_dev_available;		/* rdma device is loaded or not */
493 extern kmutex_t rdma_modload_lock;	/* protects rdma_modloaded flag */
494 extern uint_t rdma_minchunk;
495 extern ldi_ident_t rpcmod_li; 		/* needed by layered driver framework */
496 
497 /*
498  * General RDMA routines (prototypes only; the bodies are not in this header)
499  */
500 extern struct clist *clist_alloc(void);
501 extern void clist_add(struct clist **, uint32_t, int,
502 	struct mrc *, caddr_t, struct mrc *, caddr_t);
503 extern void clist_free(struct clist *);
504 extern rdma_stat clist_register(CONN *conn, struct clist *cl, clist_dstsrc);
505 extern rdma_stat clist_deregister(CONN *conn, struct clist *cl, clist_dstsrc);
506 extern rdma_stat clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc);
507 extern rdma_stat rdma_clnt_postrecv(CONN *conn, uint32_t xid);
508 extern rdma_stat rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid);
509 extern rdma_stat rdma_svc_postrecv(CONN *conn);
510 extern rdma_stat rdma_register_mod(rdma_mod_t *mod);
511 extern rdma_stat rdma_unregister_mod(rdma_mod_t *mod);
512 extern rdma_stat rdma_buf_alloc(CONN *, rdma_buf_t *);
513 extern void rdma_buf_free(CONN *, rdma_buf_t *);
514 extern int rdma_modload(void);	/* (void) makes this a real prototype, so a call with arguments is diagnosed */
515 extern bool_t   rdma_get_wchunk(struct svc_req *, iovec_t *, struct clist *);
516 
517 /*
518  * RDMA XDR routines (prototypes only; the bodies are not in this header)
519  */
520 extern void xdrrdma_create(XDR *, caddr_t, uint_t, int, struct clist *,
521 	enum xdr_op, CONN *);
522 extern void xdrrdma_destroy(XDR *);
523 
524 extern uint_t xdrrdma_getpos(XDR *);
525 extern bool_t xdrrdma_setpos(XDR *, uint_t);
526 extern bool_t xdr_clist(XDR *, clist *);
527 extern bool_t xdr_do_clist(XDR *, clist **);
528 extern uint_t xdr_getbufsize(XDR *);
529 extern unsigned int xdrrdma_sizeof(xdrproc_t, void *, int, uint_t *, uint_t *);
530 extern unsigned int xdrrdma_authsize(AUTH *, struct cred *, int);
531 
532 extern void xdrrdma_store_wlist(XDR *, struct clist *);
533 extern struct clist *xdrrdma_wclist(XDR *);
534 extern bool_t xdr_decode_reply_wchunk(XDR *, struct clist **);
535 extern bool_t xdr_decode_wlist(XDR *xdrs, struct clist **, bool_t *);
536 extern bool_t xdr_decode_wlist_svc(XDR *xdrs, struct clist **, bool_t *,
537 	uint32_t *, CONN *);
538 extern bool_t xdr_encode_rlist_svc(XDR *, clist *);
539 extern bool_t xdr_encode_wlist(XDR *, clist *);
540 extern bool_t xdr_encode_reply_wchunk(XDR *, struct clist *,
541 		uint32_t seg_array_len);
542 bool_t xdrrdma_getrdmablk(XDR *, struct clist **, uint_t *,
543 	CONN **conn, const uint_t);
544 bool_t xdrrdma_read_from_client(struct clist **, CONN **, uint_t);
545 bool_t xdrrdma_send_read_data(XDR *, struct clist *);
546 bool_t xdrrdma_free_clist(CONN *, struct clist *);
547 #endif /* _KERNEL */
548 
549 #ifdef __cplusplus
550 }
551 #endif
552 
553 #endif	/* _RPC_RPC_RDMA_H */
554