1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * Copyright (c) 2007, The Ohio State University. All rights reserved. 28 * 29 * Portions of this source code is developed by the team members of 30 * The Ohio State University's Network-Based Computing Laboratory (NBCL), 31 * headed by Professor Dhabaleswar K. (DK) Panda. 32 * 33 * Acknowledgements to contributions from developors: 34 * Ranjit Noronha: noronha@cse.ohio-state.edu 35 * Lei Chai : chail@cse.ohio-state.edu 36 * Weikuan Yu : yuw@cse.ohio-state.edu 37 * 38 */ 39 40 #ifndef _RPC_RPC_RDMA_H 41 #define _RPC_RPC_RDMA_H 42 43 #include <rpc/rpc.h> 44 #include <rpc/rpc_sztypes.h> 45 #include <sys/sunddi.h> 46 #include <sys/sunldi.h> 47 48 #ifdef __cplusplus 49 extern "C" { 50 #endif 51 52 #define RPCRDMA_VERS 1 /* Version of the RPC over RDMA protocol */ 53 #define RDMATF_VERS 1 /* Version of the API used by RPC for RDMA */ 54 #define RDMATF_VERS_1 1 /* Current version of RDMATF */ 55 56 /* 57 * The size of an RPC call or reply message 58 */ 59 #define RPC_MSG_SZ 1024 60 61 /* 62 * RDMA chunk size 63 */ 64 #define RDMA_MINCHUNK 1024 65 66 /* 67 * Storage for a chunk list 68 */ 69 #define RPC_CL_SZ 1024 70 71 /* 72 * Chunk size 73 */ 74 #define MINCHUNK 1024 75 76 /* 77 * Size of receive buffer 78 */ 79 #define RPC_BUF_SIZE 2048 80 81 #define NOWAIT 0 /* don't wait for operation of complete */ 82 #define WAIT 1 /* wait and ensure that operation is complete */ 83 84 /* 85 * RDMA xdr buffer control and other control flags. Add new flags here, 86 * set them in private structure for xdr over RDMA in xdr_rdma.c 87 */ 88 #define XDR_RDMA_CHUNK 0x1 89 #define XDR_RDMA_WLIST_REG 0x2 90 #define XDR_RDMA_RLIST_REG 0x4 91 92 #define LONG_REPLY_LEN 65536 93 #define WCL_BUF_LEN 32768 94 #define RCL_BUF_LEN 32768 95 96 97 #define RDMA_BUFS_RQST 34 /* Num bufs requested by client */ 98 #define RDMA_BUFS_GRANT 32 /* Num bufs granted by server */ 99 100 struct xdr_ops *xdrrdma_xops(void); 101 102 /* 103 * Credit Control Structures. 104 */ 105 typedef enum rdma_cc_type { 106 RDMA_CC_CLNT, /* CONN is for a client */ 107 RDMA_CC_SRV /* CONN is for a server */ 108 } rdma_cc_type_t; 109 110 /* 111 * Client side credit control data structure. 112 */ 113 typedef struct rdma_clnt_cred_ctrl { 114 uint32_t clnt_cc_granted_ops; 115 uint32_t clnt_cc_in_flight_ops; 116 kcondvar_t clnt_cc_cv; 117 } rdma_clnt_cred_ctrl_t; 118 119 /* 120 * Server side credit control data structure. 121 */ 122 typedef struct rdma_srv_cred_ctrl { 123 uint32_t srv_cc_buffers_granted; 124 uint32_t srv_cc_cur_buffers_used; 125 uint32_t srv_cc_posted; 126 uint32_t srv_cc_max_buf_size; /* to be determined by CCP */ 127 uint32_t srv_cc_cur_buf_size; /* to be determined by CCP */ 128 } rdma_srv_cred_ctrl_t; 129 130 typedef enum { 131 RPCCALL_WLIST, 132 RPCCALL_WCHUNK, 133 RPCCALL_NOWRITE 134 }rpccall_write_t; 135 136 typedef enum { 137 CLIST_REG_SOURCE, 138 CLIST_REG_DST 139 } clist_dstsrc; 140 141 /* 142 * Return codes from RDMA operations 143 */ 144 typedef enum { 145 146 RDMA_SUCCESS = 0, /* successful operation */ 147 148 RDMA_INVAL = 1, /* invalid parameter */ 149 RDMA_TIMEDOUT = 2, /* operation timed out */ 150 RDMA_INTR = 3, /* operation interrupted */ 151 RDMA_NORESOURCE = 4, /* insufficient resource */ 152 /* 153 * connection errors 154 */ 155 RDMA_REJECT = 5, /* connection req rejected */ 156 RDMA_NOLISTENER = 6, /* no listener on server */ 157 RDMA_UNREACHABLE = 7, /* host unreachable */ 158 RDMA_CONNLOST = 8, /* connection lost */ 159 160 RDMA_XPRTFAILED = 9, /* RDMA transport failed */ 161 RDMA_PROTECTERR = 10, /* memory protection error */ 162 RDMA_OVERRUN = 11, /* transport overrun */ 163 RDMA_RECVQEMPTY = 12, /* incoming pkt dropped, recv q empty */ 164 RDMA_PROTFAILED = 13, /* RDMA protocol failed */ 165 RDMA_NOTSUPP = 14, /* requested feature not supported */ 166 RDMA_REMOTERR = 15, /* error at remote end */ 167 /* 168 * RDMATF errors 169 */ 170 RDMA_BADVERS = 16, /* mismatch RDMATF versions */ 171 RDMA_REG_EXIST = 17, /* RDMATF registration already exists */ 172 RDMA_HCA_ATTACH = 18, 173 RDMA_HCA_DETACH = 19, 174 175 /* 176 * fallback error 177 */ 178 RDMA_FAILED = 20 /* generic error */ 179 } rdma_stat; 180 181 /* 182 * Memory region context. This is an RDMA provider generated 183 * handle for a registered arbitrary size contiguous virtual 184 * memory. The RDMA Interface Adapter needs this for local or 185 * remote memory access. 186 * 187 * The mrc_rmr field holds the remote memory region context 188 * which is sent over-the-wire to provide the remote host 189 * with RDMA access to the memory region. 190 */ 191 struct mrc { 192 uint32_t mrc_rmr; /* Remote MR context, sent OTW */ 193 union { 194 struct mr { 195 uint32_t lmr; /* Local MR context */ 196 uint64_t linfo; /* Local memory info */ 197 } mr; 198 } lhdl; 199 }; 200 201 #define mrc_lmr lhdl.mr.lmr 202 #define mrc_linfo lhdl.mr.linfo 203 204 /* 205 * Memory management for the RDMA buffers 206 */ 207 /* 208 * RDMA buffer types 209 */ 210 typedef enum { 211 SEND_BUFFER, /* buf for send msg */ 212 SEND_DESCRIPTOR, /* buf used for send msg descriptor in plugins only */ 213 RECV_BUFFER, /* buf for recv msg */ 214 RECV_DESCRIPTOR, /* buf used for recv msg descriptor in plugins only */ 215 RDMA_LONG_BUFFER /* chunk buf used in RDMATF only and not in plugins */ 216 } rdma_btype; 217 218 /* 219 * RDMA buffer information 220 */ 221 typedef struct rdma_buf { 222 rdma_btype type; /* buffer type */ 223 uint_t len; /* length of buffer */ 224 caddr_t addr; /* buffer address */ 225 struct mrc handle; /* buffer registration handle */ 226 caddr_t rb_private; 227 } rdma_buf_t; 228 229 230 /* 231 * The XDR offset value is used by the XDR 232 * routine to identify the position in the 233 * RPC message where the opaque object would 234 * normally occur. Neither the data content 235 * of the chunk, nor its size field are included 236 * in the RPC message. The XDR offset is calculated 237 * as if the chunks were present. 238 * 239 * The remaining fields identify the chunk of data 240 * on the sender. The c_memhandle identifies a 241 * registered RDMA memory region and the c_addr 242 * and c_len fields identify the chunk within it. 243 */ 244 struct clist { 245 uint32 c_xdroff; /* XDR offset */ 246 uint32 c_len; /* Length */ 247 struct mrc c_smemhandle; /* src memory handle */ 248 uint64 c_ssynchandle; /* src sync handle */ 249 union { 250 uint64 c_saddr; /* src address */ 251 caddr_t c_saddr3; 252 } w; 253 struct mrc c_dmemhandle; /* dst memory handle */ 254 uint64 c_dsynchandle; /* dst sync handle */ 255 union { 256 uint64 c_daddr; /* dst address */ 257 caddr_t c_daddr3; 258 } u; 259 struct as *c_adspc; /* address space for saddr/daddr */ 260 rdma_buf_t rb_longbuf; /* used for long requests/replies */ 261 struct clist *c_next; /* Next chunk */ 262 }; 263 264 typedef struct clist clist; 265 266 /* 267 * max 4M wlist xfer size 268 * This is defined because the rfs3_tsize service requires 269 * svc_req struct (which we don't have that in krecv). 270 */ 271 #define MAX_SVC_XFER_SIZE (4*1024*1024) 272 273 enum rdma_proc { 274 RDMA_MSG = 0, /* chunk list and RPC msg follow */ 275 RDMA_NOMSG = 1, /* only chunk list follows */ 276 RDMA_MSGP = 2, /* chunk list and RPC msg with padding follow */ 277 RDMA_DONE = 3 /* signal completion of chunk transfer */ 278 }; 279 280 /* 281 * Listener information for a service 282 */ 283 struct rdma_svc_data { 284 queue_t q; /* queue_t to place incoming pkts */ 285 int active; /* If active, after registeration startup */ 286 rdma_stat err_code; /* Error code from plugin layer */ 287 int32_t svcid; /* RDMA based service identifier */ 288 }; 289 290 /* 291 * Per RDMA plugin module information. 292 * Will be populated by each plugin 293 * module during its initialization. 294 */ 295 typedef struct rdma_mod { 296 char *rdma_api; /* "kvipl", "ibtf", etc */ 297 uint_t rdma_version; /* RDMATF API version */ 298 int rdma_count; /* # of devices */ 299 struct rdmaops *rdma_ops; /* rdma op vector for api */ 300 } rdma_mod_t; 301 302 /* 303 * Registry of RDMA plugins 304 */ 305 typedef struct rdma_registry { 306 rdma_mod_t *r_mod; /* plugin mod info */ 307 uint32_t r_mod_state; 308 struct rdma_registry *r_next; /* next registered RDMA plugin */ 309 } rdma_registry_t; 310 311 /* 312 * RDMA MODULE state flags (r_mod_state). 313 */ 314 #define RDMA_MOD_ACTIVE 1 315 #define RDMA_MOD_INACTIVE 0 316 317 /* 318 * RDMA transport information 319 */ 320 typedef struct rdma_info { 321 uint_t addrlen; /* address length */ 322 uint_t mts; /* max transfer size */ 323 uint_t mtu; /* native mtu size of unlerlying network */ 324 } rdma_info_t; 325 326 typedef enum { 327 C_IDLE = 0x00000001, 328 C_CONN_PEND = 0x00000002, 329 C_CONNECTED = 0x00000004, 330 C_ERROR_CONN = 0x00000008, 331 C_DISCONN_PEND = 0x00000010, 332 C_REMOTE_DOWN = 0x00000020 333 } conn_c_state; 334 335 /* 336 * RDMA Connection information 337 */ 338 typedef struct conn { 339 rdma_mod_t *c_rdmamod; /* RDMA transport info for conn */ 340 struct netbuf c_raddr; /* remote address */ 341 struct netbuf c_laddr; /* local address */ 342 int c_ref; /* no. of clients of connection */ 343 struct conn *c_next; /* next in list of connections */ 344 struct conn *c_prev; /* prev in list of connections */ 345 caddr_t c_private; /* transport specific stuff */ 346 conn_c_state c_state; /* state of connection */ 347 rdma_cc_type_t c_cc_type; /* client or server, for credit cntrl */ 348 union { 349 rdma_clnt_cred_ctrl_t c_clnt_cc; 350 rdma_srv_cred_ctrl_t c_srv_cc; 351 } rdma_conn_cred_ctrl_u; 352 kmutex_t c_lock; /* protect c_state and c_ref fields */ 353 kcondvar_t c_cv; /* to signal when pending is done */ 354 } CONN; 355 356 357 /* 358 * Data transferred from plugin interrupt to svc_queuereq() 359 */ 360 typedef struct rdma_recv_data { 361 CONN *conn; 362 int status; 363 rdma_buf_t rpcmsg; 364 } rdma_recv_data_t; 365 366 /* structure used to pass information for READ over rdma write */ 367 typedef enum { 368 RCI_WRITE_UIO_CHUNK = 1, 369 RCI_WRITE_ADDR_CHUNK = 2, 370 RCI_REPLY_CHUNK = 3 371 } rci_type_t; 372 373 typedef struct { 374 rci_type_t rci_type; 375 union { 376 struct uio *rci_uiop; 377 caddr_t rci_addr; 378 } rci_a; 379 uint32 rci_len; 380 struct clist **rci_clpp; /* point to write chunk list in readargs */ 381 } rdma_chunkinfo_t; 382 383 typedef struct { 384 uint_t rcil_len; 385 uint_t rcil_len_alt; 386 } rdma_chunkinfo_lengths_t; 387 388 typedef struct { 389 struct clist *rwci_wlist; 390 CONN *rwci_conn; 391 } rdma_wlist_conn_info_t; 392 393 /* 394 * Operations vector for RDMA transports. 395 */ 396 typedef struct rdmaops { 397 /* Network */ 398 rdma_stat (*rdma_reachable)(int addr_type, struct netbuf *, 399 void **handle); 400 /* Connection */ 401 rdma_stat (*rdma_get_conn)(struct netbuf *, int addr_type, 402 void *, CONN **); 403 rdma_stat (*rdma_rel_conn)(CONN *); 404 /* Server side listner start and stop routines */ 405 void (*rdma_svc_listen)(struct rdma_svc_data *); 406 void (*rdma_svc_stop)(struct rdma_svc_data *); 407 /* Memory */ 408 rdma_stat (*rdma_regmem)(CONN *, caddr_t, caddr_t, 409 uint_t, struct mrc *); 410 rdma_stat (*rdma_deregmem)(CONN *, caddr_t, struct mrc); 411 rdma_stat (*rdma_regmemsync)(CONN *, caddr_t, caddr_t, uint_t, 412 struct mrc *, void **, void *); 413 rdma_stat (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc, 414 void *, void *); 415 rdma_stat (*rdma_syncmem)(CONN *, void *, caddr_t, int, int); 416 /* Buffer */ 417 rdma_stat (*rdma_buf_alloc)(CONN *, rdma_buf_t *); 418 void (*rdma_buf_free)(CONN *, rdma_buf_t *); 419 /* Transfer */ 420 rdma_stat (*rdma_send)(CONN *, clist *, uint32_t); 421 rdma_stat (*rdma_send_resp)(CONN *, clist *, uint32_t); 422 rdma_stat (*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t); 423 rdma_stat (*rdma_clnt_recvbuf_remove)(CONN *, uint32_t); 424 rdma_stat (*rdma_svc_recvbuf)(CONN *, clist *); 425 rdma_stat (*rdma_recv)(CONN *, clist **, uint32_t); 426 /* RDMA */ 427 rdma_stat (*rdma_read)(CONN *, clist *, int); 428 rdma_stat (*rdma_write)(CONN *, clist *, int); 429 /* INFO */ 430 rdma_stat (*rdma_getinfo)(rdma_info_t *info); 431 } rdmaops_t; 432 433 typedef struct rdma_svc_wait { 434 kmutex_t svc_lock; 435 kcondvar_t svc_cv; 436 rdma_stat svc_stat; 437 } rdma_svc_wait_t; 438 439 extern rdma_svc_wait_t rdma_wait; 440 441 /* 442 * RDMA operations. 443 */ 444 #define RDMA_REACHABLE(rdma_ops, addr_type, addr, handle) \ 445 (*(rdma_ops)->rdma_reachable)(addr_type, addr, handle) 446 447 #define RDMA_GET_CONN(rdma_ops, addr, addr_type, handle, conn) \ 448 (*(rdma_ops)->rdma_get_conn)(addr, addr_type, handle, conn) 449 450 #define RDMA_REL_CONN(conn) \ 451 (*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn) 452 453 #define RDMA_REGMEM(conn, adsp, buff, len, handle) \ 454 (*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, adsp, \ 455 buff, len, handle) 456 457 #define RDMA_DEREGMEM(conn, buff, handle) \ 458 (*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle) 459 460 #define RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc) \ 461 (*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \ 462 len, handle, synchandle, lrc) 463 464 #define RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc) \ 465 (*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \ 466 handle, synchandle, lrc) 467 468 #define RDMA_SYNCMEM(conn, handle, buff, len, direction) \ 469 (*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \ 470 buff, len, direction) 471 472 #define RDMA_BUF_ALLOC(conn, rbuf) \ 473 (*(conn)->c_rdmamod->rdma_ops->rdma_buf_alloc)(conn, rbuf) 474 475 #define RDMA_BUF_FREE(conn, rbuf) \ 476 (*(conn)->c_rdmamod->rdma_ops->rdma_buf_free)(conn, rbuf) 477 478 #define RDMA_SEND(conn, sendlist, xid) \ 479 (*(conn)->c_rdmamod->rdma_ops->rdma_send)(conn, sendlist, xid) 480 481 #define RDMA_SEND_RESP(conn, sendlist, xid) \ 482 (*(conn)->c_rdmamod->rdma_ops->rdma_send_resp)(conn, sendlist, xid) 483 484 #define RDMA_CLNT_RECVBUF(conn, cl, xid) \ 485 (*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf)(conn, cl, xid) 486 487 #define RDMA_CLNT_RECVBUF_REMOVE(conn, xid) \ 488 (*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf_remove)(conn, xid) 489 490 #define RDMA_SVC_RECVBUF(conn, cl) \ 491 (*(conn)->c_rdmamod->rdma_ops->rdma_svc_recvbuf)(conn, cl) 492 493 #define RDMA_RECV(conn, recvlist, xid) \ 494 (*(conn)->c_rdmamod->rdma_ops->rdma_recv)(conn, recvlist, xid) 495 496 #define RDMA_READ(conn, cl, wait) \ 497 (*(conn)->c_rdmamod->rdma_ops->rdma_read)(conn, cl, wait) 498 499 #define RDMA_WRITE(conn, cl, wait) \ 500 (*(conn)->c_rdmamod->rdma_ops->rdma_write)(conn, cl, wait) 501 502 #define RDMA_GETINFO(rdma_mod, info) \ 503 (*(rdma_mod)->rdma_ops->rdma_getinfo)(info) 504 505 #ifdef _KERNEL 506 extern rdma_registry_t *rdma_mod_head; 507 extern krwlock_t rdma_lock; /* protects rdma_mod_head list */ 508 extern int rdma_modloaded; /* flag for loading RDMA plugins */ 509 extern int rdma_dev_available; /* rdma device is loaded or not */ 510 extern kmutex_t rdma_modload_lock; /* protects rdma_modloaded flag */ 511 extern uint_t rdma_minchunk; 512 extern ldi_ident_t rpcmod_li; /* needed by layed driver framework */ 513 514 /* 515 * General RDMA routines 516 */ 517 extern struct clist *clist_alloc(void); 518 extern void clist_add(struct clist **, uint32_t, int, 519 struct mrc *, caddr_t, struct mrc *, caddr_t); 520 extern void clist_free(struct clist *); 521 extern rdma_stat clist_register(CONN *conn, struct clist *cl, clist_dstsrc); 522 extern rdma_stat clist_deregister(CONN *conn, struct clist *cl, clist_dstsrc); 523 extern rdma_stat clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc); 524 extern rdma_stat rdma_clnt_postrecv(CONN *conn, uint32_t xid); 525 extern rdma_stat rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid); 526 extern rdma_stat rdma_svc_postrecv(CONN *conn); 527 extern rdma_stat rdma_register_mod(rdma_mod_t *mod); 528 extern rdma_stat rdma_unregister_mod(rdma_mod_t *mod); 529 extern rdma_stat rdma_buf_alloc(CONN *, rdma_buf_t *); 530 extern void rdma_buf_free(CONN *, rdma_buf_t *); 531 extern int rdma_modload(); 532 extern bool_t rdma_get_wchunk(struct svc_req *, iovec_t *, struct clist *); 533 extern rdma_stat rdma_kwait(void); 534 535 /* 536 * RDMA XDR 537 */ 538 extern void xdrrdma_create(XDR *, caddr_t, uint_t, int, struct clist *, 539 enum xdr_op, CONN *); 540 extern void xdrrdma_destroy(XDR *); 541 542 extern uint_t xdrrdma_getpos(XDR *); 543 extern bool_t xdrrdma_setpos(XDR *, uint_t); 544 extern bool_t xdr_clist(XDR *, clist *); 545 extern bool_t xdr_do_clist(XDR *, clist **); 546 extern uint_t xdr_getbufsize(XDR *); 547 extern unsigned int xdrrdma_sizeof(xdrproc_t, void *, int, uint_t *, uint_t *); 548 extern unsigned int xdrrdma_authsize(AUTH *, struct cred *, int); 549 550 extern void xdrrdma_store_wlist(XDR *, struct clist *); 551 extern struct clist *xdrrdma_wclist(XDR *); 552 extern bool_t xdr_decode_reply_wchunk(XDR *, struct clist **); 553 extern bool_t xdr_decode_wlist(XDR *xdrs, struct clist **, bool_t *); 554 extern bool_t xdr_decode_wlist_svc(XDR *xdrs, struct clist **, bool_t *, 555 uint32_t *, CONN *); 556 extern bool_t xdr_encode_rlist_svc(XDR *, clist *); 557 extern bool_t xdr_encode_wlist(XDR *, clist *); 558 extern bool_t xdr_encode_reply_wchunk(XDR *, struct clist *, 559 uint32_t seg_array_len); 560 bool_t xdrrdma_getrdmablk(XDR *, struct clist **, uint_t *, 561 CONN **conn, const uint_t); 562 bool_t xdrrdma_read_from_client(struct clist **, CONN **, uint_t); 563 bool_t xdrrdma_send_read_data(XDR *, struct clist *); 564 bool_t xdrrdma_free_clist(CONN *, struct clist *); 565 #endif /* _KERNEL */ 566 567 #ifdef __cplusplus 568 } 569 #endif 570 571 #endif /* _RPC_RPC_RDMA_H */ 572