1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2007, The Ohio State University. All rights reserved. 28 * 29 * Portions of this source code is developed by the team members of 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL), 31 * headed by Professor Dhabaleswar K. (DK) Panda. 32 * 33 * Acknowledgements to contributions from developors: 34 * Ranjit Noronha: noronha@cse.ohio-state.edu 35 * Lei Chai : chail@cse.ohio-state.edu 36 * Weikuan Yu : yuw@cse.ohio-state.edu 37 * 38 */ 39 40 #ifndef _RPC_RPC_RDMA_H 41 #define _RPC_RPC_RDMA_H 42 43 #include <rpc/rpc.h> 44 #include <rpc/rpc_sztypes.h> 45 #include <sys/sunddi.h> 46 #include <sys/sunldi.h> 47 48 #ifdef __cplusplus 49 extern "C" { 50 #endif 51 52 #define RPCRDMA_VERS 1 /* Version of the RPC over RDMA protocol */ 53 #define RDMATF_VERS 1 /* Version of the API used by RPC for RDMA */ 54 #define RDMATF_VERS_1 1 /* Current version of RDMATF */ 55 56 /* 57 * The size of an RPC call or reply message 58 */ 59 #define RPC_MSG_SZ 1024 60 61 /* 62 * RDMA chunk size 63 */ 64 #define RDMA_MINCHUNK 1024 65 66 /* 67 * Storage for a chunk list 68 */ 69 #define RPC_CL_SZ 1024 70 71 /* 72 * Chunk size 73 */ 74 #define MINCHUNK 1024 75 76 /* 77 * Size of receive buffer 78 */ 79 #define RPC_BUF_SIZE 2048 80 81 #define NOWAIT 0 /* don't wait for operation of complete */ 82 #define WAIT 1 /* wait and ensure that operation is complete */ 83 84 /* 85 * RDMA xdr buffer control and other control flags. Add new flags here, 86 * set them in private structure for xdr over RDMA in xdr_rdma.c 87 */ 88 #define XDR_RDMA_CHUNK 0x1 89 #define XDR_RDMA_WLIST_REG 0x2 90 #define XDR_RDMA_RLIST_REG 0x4 91 92 #define LONG_REPLY_LEN 65536 93 #define WCL_BUF_LEN 32768 94 #define RCL_BUF_LEN 32768 95 96 97 #define RDMA_BUFS_RQST 34 /* Num bufs requested by client */ 98 #define RDMA_BUFS_GRANT 32 /* Num bufs granted by server */ 99 100 struct xdr_ops *xdrrdma_xops(void); 101 102 /* 103 * Credit Control Structures. 104 */ 105 typedef enum rdma_cc_type { 106 RDMA_CC_CLNT, /* CONN is for a client */ 107 RDMA_CC_SRV /* CONN is for a server */ 108 } rdma_cc_type_t; 109 110 /* 111 * Client side credit control data structure. 112 */ 113 typedef struct rdma_clnt_cred_ctrl { 114 uint32_t clnt_cc_granted_ops; 115 uint32_t clnt_cc_in_flight_ops; 116 kcondvar_t clnt_cc_cv; 117 } rdma_clnt_cred_ctrl_t; 118 119 /* 120 * Server side credit control data structure. 121 */ 122 typedef struct rdma_srv_cred_ctrl { 123 uint32_t srv_cc_buffers_granted; 124 uint32_t srv_cc_cur_buffers_used; 125 uint32_t srv_cc_posted; 126 uint32_t srv_cc_max_buf_size; /* to be determined by CCP */ 127 uint32_t srv_cc_cur_buf_size; /* to be determined by CCP */ 128 } rdma_srv_cred_ctrl_t; 129 130 typedef enum { 131 RPCCALL_WLIST, 132 RPCCALL_WCHUNK, 133 RPCCALL_NOWRITE 134 }rpccall_write_t; 135 136 typedef enum { 137 CLIST_REG_SOURCE, 138 CLIST_REG_DST 139 } clist_dstsrc; 140 141 /* 142 * Return codes from RDMA operations 143 */ 144 typedef enum { 145 146 RDMA_SUCCESS = 0, /* successful operation */ 147 148 RDMA_INVAL = 1, /* invalid parameter */ 149 RDMA_TIMEDOUT = 2, /* operation timed out */ 150 RDMA_INTR = 3, /* operation interrupted */ 151 RDMA_NORESOURCE = 4, /* insufficient resource */ 152 /* 153 * connection errors 154 */ 155 RDMA_REJECT = 5, /* connection req rejected */ 156 RDMA_NOLISTENER = 6, /* no listener on server */ 157 RDMA_UNREACHABLE = 7, /* host unreachable */ 158 RDMA_CONNLOST = 8, /* connection lost */ 159 160 RDMA_XPRTFAILED = 9, /* RDMA transport failed */ 161 RDMA_PROTECTERR = 10, /* memory protection error */ 162 RDMA_OVERRUN = 11, /* transport overrun */ 163 RDMA_RECVQEMPTY = 12, /* incoming pkt dropped, recv q empty */ 164 RDMA_PROTFAILED = 13, /* RDMA protocol failed */ 165 RDMA_NOTSUPP = 14, /* requested feature not supported */ 166 RDMA_REMOTERR = 15, /* error at remote end */ 167 /* 168 * RDMATF errors 169 */ 170 RDMA_BADVERS = 16, /* mismatch RDMATF versions */ 171 RDMA_REG_EXIST = 17, /* RDMATF registration already exists */ 172 173 /* 174 * fallback error 175 */ 176 RDMA_FAILED = 18 /* generic error */ 177 } rdma_stat; 178 179 /* 180 * Memory region context. This is an RDMA provider generated 181 * handle for a registered arbitrary size contiguous virtual 182 * memory. The RDMA Interface Adapter needs this for local or 183 * remote memory access. 184 * 185 * The mrc_rmr field holds the remote memory region context 186 * which is sent over-the-wire to provide the remote host 187 * with RDMA access to the memory region. 188 */ 189 struct mrc { 190 uint32_t mrc_rmr; /* Remote MR context, sent OTW */ 191 union { 192 struct mr { 193 uint32_t lmr; /* Local MR context */ 194 uint64_t linfo; /* Local memory info */ 195 } mr; 196 } lhdl; 197 }; 198 199 #define mrc_lmr lhdl.mr.lmr 200 #define mrc_linfo lhdl.mr.linfo 201 202 /* 203 * Memory management for the RDMA buffers 204 */ 205 /* 206 * RDMA buffer types 207 */ 208 typedef enum { 209 SEND_BUFFER, /* buf for send msg */ 210 SEND_DESCRIPTOR, /* buf used for send msg descriptor in plugins only */ 211 RECV_BUFFER, /* buf for recv msg */ 212 RECV_DESCRIPTOR, /* buf used for recv msg descriptor in plugins only */ 213 RDMA_LONG_BUFFER /* chunk buf used in RDMATF only and not in plugins */ 214 } rdma_btype; 215 216 /* 217 * RDMA buffer information 218 */ 219 typedef struct rdma_buf { 220 rdma_btype type; /* buffer type */ 221 uint_t len; /* length of buffer */ 222 caddr_t addr; /* buffer address */ 223 struct mrc handle; /* buffer registration handle */ 224 caddr_t rb_private; 225 } rdma_buf_t; 226 227 228 /* 229 * The XDR offset value is used by the XDR 230 * routine to identify the position in the 231 * RPC message where the opaque object would 232 * normally occur. Neither the data content 233 * of the chunk, nor its size field are included 234 * in the RPC message. The XDR offset is calculated 235 * as if the chunks were present. 236 * 237 * The remaining fields identify the chunk of data 238 * on the sender. The c_memhandle identifies a 239 * registered RDMA memory region and the c_addr 240 * and c_len fields identify the chunk within it. 241 */ 242 struct clist { 243 uint32 c_xdroff; /* XDR offset */ 244 uint32 c_len; /* Length */ 245 struct mrc c_smemhandle; /* src memory handle */ 246 uint64 c_ssynchandle; /* src sync handle */ 247 union { 248 uint64 c_saddr; /* src address */ 249 caddr_t c_saddr3; 250 } w; 251 struct mrc c_dmemhandle; /* dst memory handle */ 252 uint64 c_dsynchandle; /* dst sync handle */ 253 union { 254 uint64 c_daddr; /* dst address */ 255 caddr_t c_daddr3; 256 } u; 257 struct as *c_adspc; /* address space for saddr/daddr */ 258 rdma_buf_t rb_longbuf; /* used for long requests/replies */ 259 struct clist *c_next; /* Next chunk */ 260 }; 261 262 typedef struct clist clist; 263 264 /* 265 * max 4M wlist xfer size 266 * This is defined because the rfs3_tsize service requires 267 * svc_req struct (which we don't have that in krecv). 268 */ 269 #define MAX_SVC_XFER_SIZE (4*1024*1024) 270 271 enum rdma_proc { 272 RDMA_MSG = 0, /* chunk list and RPC msg follow */ 273 RDMA_NOMSG = 1, /* only chunk list follows */ 274 RDMA_MSGP = 2, /* chunk list and RPC msg with padding follow */ 275 RDMA_DONE = 3 /* signal completion of chunk transfer */ 276 }; 277 278 /* 279 * Listener information for a service 280 */ 281 struct rdma_svc_data { 282 queue_t q; /* queue_t to place incoming pkts */ 283 int active; /* If active, after registeration startup */ 284 rdma_stat err_code; /* Error code from plugin layer */ 285 int32_t svcid; /* RDMA based service identifier */ 286 }; 287 288 /* 289 * Per RDMA plugin module information. 290 * Will be populated by each plugin 291 * module during its initialization. 292 */ 293 typedef struct rdma_mod { 294 char *rdma_api; /* "kvipl", "ibtf", etc */ 295 uint_t rdma_version; /* RDMATF API version */ 296 int rdma_count; /* # of devices */ 297 struct rdmaops *rdma_ops; /* rdma op vector for api */ 298 } rdma_mod_t; 299 300 /* 301 * Registry of RDMA plugins 302 */ 303 typedef struct rdma_registry { 304 rdma_mod_t *r_mod; /* plugin mod info */ 305 struct rdma_registry *r_next; /* next registered RDMA plugin */ 306 } rdma_registry_t; 307 308 /* 309 * RDMA transport information 310 */ 311 typedef struct rdma_info { 312 uint_t addrlen; /* address length */ 313 uint_t mts; /* max transfer size */ 314 uint_t mtu; /* native mtu size of unlerlying network */ 315 } rdma_info_t; 316 317 typedef enum { 318 C_IDLE = 0x00000001, 319 C_CONN_PEND = 0x00000002, 320 C_CONNECTED = 0x00000004, 321 C_ERROR_CONN = 0x00000008, 322 C_DISCONN_PEND = 0x00000010, 323 C_REMOTE_DOWN = 0x00000020 324 } conn_c_state; 325 326 /* 327 * RDMA Connection information 328 */ 329 typedef struct conn { 330 rdma_mod_t *c_rdmamod; /* RDMA transport info for conn */ 331 struct netbuf c_raddr; /* remote address */ 332 struct netbuf c_laddr; /* local address */ 333 int c_ref; /* no. of clients of connection */ 334 struct conn *c_next; /* next in list of connections */ 335 struct conn *c_prev; /* prev in list of connections */ 336 caddr_t c_private; /* transport specific stuff */ 337 conn_c_state c_state; /* state of connection */ 338 rdma_cc_type_t c_cc_type; /* client or server, for credit cntrl */ 339 union { 340 rdma_clnt_cred_ctrl_t c_clnt_cc; 341 rdma_srv_cred_ctrl_t c_srv_cc; 342 } rdma_conn_cred_ctrl_u; 343 kmutex_t c_lock; /* protect c_state and c_ref fields */ 344 kcondvar_t c_cv; /* to signal when pending is done */ 345 } CONN; 346 347 348 /* 349 * Data transferred from plugin interrupt to svc_queuereq() 350 */ 351 typedef struct rdma_recv_data { 352 CONN *conn; 353 int status; 354 rdma_buf_t rpcmsg; 355 } rdma_recv_data_t; 356 357 /* structure used to pass information for READ over rdma write */ 358 typedef enum { 359 RCI_WRITE_UIO_CHUNK = 1, 360 RCI_WRITE_ADDR_CHUNK = 2, 361 RCI_REPLY_CHUNK = 3 362 } rci_type_t; 363 364 typedef struct { 365 rci_type_t rci_type; 366 union { 367 struct uio *rci_uiop; 368 caddr_t rci_addr; 369 } rci_a; 370 uint32 rci_len; 371 struct clist **rci_clpp; /* point to write chunk list in readargs */ 372 } rdma_chunkinfo_t; 373 374 typedef struct { 375 uint_t rcil_len; 376 uint_t rcil_len_alt; 377 } rdma_chunkinfo_lengths_t; 378 379 typedef struct { 380 struct clist *rwci_wlist; 381 CONN *rwci_conn; 382 } rdma_wlist_conn_info_t; 383 384 /* 385 * Operations vector for RDMA transports. 386 */ 387 typedef struct rdmaops { 388 /* Network */ 389 rdma_stat (*rdma_reachable)(int addr_type, struct netbuf *, 390 void **handle); 391 /* Connection */ 392 rdma_stat (*rdma_get_conn)(struct netbuf *, int addr_type, 393 void *, CONN **); 394 rdma_stat (*rdma_rel_conn)(CONN *); 395 /* Server side listner start and stop routines */ 396 void (*rdma_svc_listen)(struct rdma_svc_data *); 397 void (*rdma_svc_stop)(struct rdma_svc_data *); 398 /* Memory */ 399 rdma_stat (*rdma_regmem)(CONN *, caddr_t, caddr_t, 400 uint_t, struct mrc *); 401 rdma_stat (*rdma_deregmem)(CONN *, caddr_t, struct mrc); 402 rdma_stat (*rdma_regmemsync)(CONN *, caddr_t, caddr_t, uint_t, 403 struct mrc *, void **, void *); 404 rdma_stat (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc, 405 void *, void *); 406 rdma_stat (*rdma_syncmem)(CONN *, void *, caddr_t, int, int); 407 /* Buffer */ 408 rdma_stat (*rdma_buf_alloc)(CONN *, rdma_buf_t *); 409 void (*rdma_buf_free)(CONN *, rdma_buf_t *); 410 /* Transfer */ 411 rdma_stat (*rdma_send)(CONN *, clist *, uint32_t); 412 rdma_stat (*rdma_send_resp)(CONN *, clist *, uint32_t); 413 rdma_stat (*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t); 414 rdma_stat (*rdma_clnt_recvbuf_remove)(CONN *, uint32_t); 415 rdma_stat (*rdma_svc_recvbuf)(CONN *, clist *); 416 rdma_stat (*rdma_recv)(CONN *, clist **, uint32_t); 417 /* RDMA */ 418 rdma_stat (*rdma_read)(CONN *, clist *, int); 419 rdma_stat (*rdma_write)(CONN *, clist *, int); 420 /* INFO */ 421 rdma_stat (*rdma_getinfo)(rdma_info_t *info); 422 } rdmaops_t; 423 424 /* 425 * RDMA operations. 426 */ 427 #define RDMA_REACHABLE(rdma_ops, addr_type, addr, handle) \ 428 (*(rdma_ops)->rdma_reachable)(addr_type, addr, handle) 429 430 #define RDMA_GET_CONN(rdma_ops, addr, addr_type, handle, conn) \ 431 (*(rdma_ops)->rdma_get_conn)(addr, addr_type, handle, conn) 432 433 #define RDMA_REL_CONN(conn) \ 434 (*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn) 435 436 #define RDMA_REGMEM(conn, adsp, buff, len, handle) \ 437 (*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, adsp, \ 438 buff, len, handle) 439 440 #define RDMA_DEREGMEM(conn, buff, handle) \ 441 (*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle) 442 443 #define RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc) \ 444 (*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \ 445 len, handle, synchandle, lrc) 446 447 #define RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc) \ 448 (*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \ 449 handle, synchandle, lrc) 450 451 #define RDMA_SYNCMEM(conn, handle, buff, len, direction) \ 452 (*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \ 453 buff, len, direction) 454 455 #define RDMA_BUF_ALLOC(conn, rbuf) \ 456 (*(conn)->c_rdmamod->rdma_ops->rdma_buf_alloc)(conn, rbuf) 457 458 #define RDMA_BUF_FREE(conn, rbuf) \ 459 (*(conn)->c_rdmamod->rdma_ops->rdma_buf_free)(conn, rbuf) 460 461 #define RDMA_SEND(conn, sendlist, xid) \ 462 (*(conn)->c_rdmamod->rdma_ops->rdma_send)(conn, sendlist, xid) 463 464 #define RDMA_SEND_RESP(conn, sendlist, xid) \ 465 (*(conn)->c_rdmamod->rdma_ops->rdma_send_resp)(conn, sendlist, xid) 466 467 #define RDMA_CLNT_RECVBUF(conn, cl, xid) \ 468 (*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf)(conn, cl, xid) 469 470 #define RDMA_CLNT_RECVBUF_REMOVE(conn, xid) \ 471 (*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf_remove)(conn, xid) 472 473 #define RDMA_SVC_RECVBUF(conn, cl) \ 474 (*(conn)->c_rdmamod->rdma_ops->rdma_svc_recvbuf)(conn, cl) 475 476 #define RDMA_RECV(conn, recvlist, xid) \ 477 (*(conn)->c_rdmamod->rdma_ops->rdma_recv)(conn, recvlist, xid) 478 479 #define RDMA_READ(conn, cl, wait) \ 480 (*(conn)->c_rdmamod->rdma_ops->rdma_read)(conn, cl, wait) 481 482 #define RDMA_WRITE(conn, cl, wait) \ 483 (*(conn)->c_rdmamod->rdma_ops->rdma_write)(conn, cl, wait) 484 485 #define RDMA_GETINFO(rdma_mod, info) \ 486 (*(rdma_mod)->rdma_ops->rdma_getinfo)(info) 487 488 #ifdef _KERNEL 489 extern rdma_registry_t *rdma_mod_head; 490 extern krwlock_t rdma_lock; /* protects rdma_mod_head list */ 491 extern int rdma_modloaded; /* flag for loading RDMA plugins */ 492 extern int rdma_dev_available; /* rdma device is loaded or not */ 493 extern kmutex_t rdma_modload_lock; /* protects rdma_modloaded flag */ 494 extern uint_t rdma_minchunk; 495 extern ldi_ident_t rpcmod_li; /* needed by layed driver framework */ 496 497 /* 498 * General RDMA routines 499 */ 500 extern struct clist *clist_alloc(void); 501 extern void clist_add(struct clist **, uint32_t, int, 502 struct mrc *, caddr_t, struct mrc *, caddr_t); 503 extern void clist_free(struct clist *); 504 extern rdma_stat clist_register(CONN *conn, struct clist *cl, clist_dstsrc); 505 extern rdma_stat clist_deregister(CONN *conn, struct clist *cl, clist_dstsrc); 506 extern rdma_stat clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc); 507 extern rdma_stat rdma_clnt_postrecv(CONN *conn, uint32_t xid); 508 extern rdma_stat rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid); 509 extern rdma_stat rdma_svc_postrecv(CONN *conn); 510 extern rdma_stat rdma_register_mod(rdma_mod_t *mod); 511 extern rdma_stat rdma_unregister_mod(rdma_mod_t *mod); 512 extern rdma_stat rdma_buf_alloc(CONN *, rdma_buf_t *); 513 extern void rdma_buf_free(CONN *, rdma_buf_t *); 514 extern int rdma_modload(); 515 extern bool_t rdma_get_wchunk(struct svc_req *, iovec_t *, struct clist *); 516 517 /* 518 * RDMA XDR 519 */ 520 extern void xdrrdma_create(XDR *, caddr_t, uint_t, int, struct clist *, 521 enum xdr_op, CONN *); 522 extern void xdrrdma_destroy(XDR *); 523 524 extern uint_t xdrrdma_getpos(XDR *); 525 extern bool_t xdrrdma_setpos(XDR *, uint_t); 526 extern bool_t xdr_clist(XDR *, clist *); 527 extern bool_t xdr_do_clist(XDR *, clist **); 528 extern uint_t xdr_getbufsize(XDR *); 529 extern unsigned int xdrrdma_sizeof(xdrproc_t, void *, int, uint_t *, uint_t *); 530 extern unsigned int xdrrdma_authsize(AUTH *, struct cred *, int); 531 532 extern void xdrrdma_store_wlist(XDR *, struct clist *); 533 extern struct clist *xdrrdma_wclist(XDR *); 534 extern bool_t xdr_decode_reply_wchunk(XDR *, struct clist **); 535 extern bool_t xdr_decode_wlist(XDR *xdrs, struct clist **, bool_t *); 536 extern bool_t xdr_decode_wlist_svc(XDR *xdrs, struct clist **, bool_t *, 537 uint32_t *, CONN *); 538 extern bool_t xdr_encode_rlist_svc(XDR *, clist *); 539 extern bool_t xdr_encode_wlist(XDR *, clist *); 540 extern bool_t xdr_encode_reply_wchunk(XDR *, struct clist *, 541 uint32_t seg_array_len); 542 bool_t xdrrdma_getrdmablk(XDR *, struct clist **, uint_t *, 543 CONN **conn, const uint_t); 544 bool_t xdrrdma_read_from_client(struct clist **, CONN **, uint_t); 545 bool_t xdrrdma_send_read_data(XDR *, struct clist *); 546 bool_t xdrrdma_free_clist(CONN *, struct clist *); 547 #endif /* _KERNEL */ 548 549 #ifdef __cplusplus 550 } 551 #endif 552 553 #endif /* _RPC_RPC_RDMA_H */ 554