/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2007, The Ohio State University. All rights reserved.
 *
 * Portions of this source code is developed by the team members of
 * The Ohio State University's Network-Based Computing Laboratory (NBCL),
 * headed by Professor Dhabaleswar K. (DK) Panda.
32 * 33 * Acknowledgements to contributions from developors: 34 * Ranjit Noronha: noronha@cse.ohio-state.edu 35 * Lei Chai : chail@cse.ohio-state.edu 36 * Weikuan Yu : yuw@cse.ohio-state.edu 37 * 38 */ 39 40 #ifndef _RPC_RPC_RDMA_H 41 #define _RPC_RPC_RDMA_H 42 43 #include <rpc/rpc.h> 44 #include <rpc/rpc_sztypes.h> 45 #include <sys/sunddi.h> 46 #include <sys/sunldi.h> 47 48 #ifdef __cplusplus 49 extern "C" { 50 #endif 51 52 #define RPCRDMA_VERS 1 /* Version of the RPC over RDMA protocol */ 53 #define RDMATF_VERS 1 /* Version of the API used by RPC for RDMA */ 54 #define RDMATF_VERS_1 1 /* Current version of RDMATF */ 55 56 /* 57 * The size of an RPC call or reply message 58 */ 59 #define RPC_MSG_SZ 1024 60 61 /* 62 * RDMA chunk size 63 */ 64 #define RDMA_MINCHUNK 1024 65 66 /* 67 * Storage for a chunk list 68 */ 69 #define RPC_CL_SZ 1024 70 71 /* 72 * Chunk size 73 */ 74 #define MINCHUNK 1024 75 76 /* 77 * Size of receive buffer 78 */ 79 #define RPC_BUF_SIZE 2048 80 81 #define NOWAIT 0 /* don't wait for operation of complete */ 82 #define WAIT 1 /* wait and ensure that operation is complete */ 83 84 /* 85 * RDMA xdr buffer control and other control flags. Add new flags here, 86 * set them in private structure for xdr over RDMA in xdr_rdma.c 87 */ 88 #define XDR_RDMA_CHUNK 0x1 89 #define XDR_RDMA_WLIST_REG 0x2 90 #define XDR_RDMA_RLIST_REG 0x4 91 92 #define LONG_REPLY_LEN 65536 93 #define WCL_BUF_LEN 32768 94 #define RCL_BUF_LEN 32768 95 96 97 #define RDMA_BUFS_RQST 34 /* Num bufs requested by client */ 98 #define RDMA_BUFS_GRANT 32 /* Num bufs granted by server */ 99 100 struct xdr_ops *xdrrdma_xops(void); 101 102 /* 103 * Credit Control Structures. 104 */ 105 typedef enum rdma_cc_type { 106 RDMA_CC_CLNT, /* CONN is for a client */ 107 RDMA_CC_SRV /* CONN is for a server */ 108 } rdma_cc_type_t; 109 110 /* 111 * Client side credit control data structure. 
112 */ 113 typedef struct rdma_clnt_cred_ctrl { 114 uint32_t clnt_cc_granted_ops; 115 uint32_t clnt_cc_in_flight_ops; 116 kcondvar_t clnt_cc_cv; 117 } rdma_clnt_cred_ctrl_t; 118 119 /* 120 * Server side credit control data structure. 121 */ 122 typedef struct rdma_srv_cred_ctrl { 123 uint32_t srv_cc_buffers_granted; 124 uint32_t srv_cc_cur_buffers_used; 125 uint32_t srv_cc_posted; 126 uint32_t srv_cc_max_buf_size; /* to be determined by CCP */ 127 uint32_t srv_cc_cur_buf_size; /* to be determined by CCP */ 128 } rdma_srv_cred_ctrl_t; 129 130 typedef enum { 131 RPCCALL_WLIST, 132 RPCCALL_WCHUNK, 133 RPCCALL_NOWRITE 134 }rpccall_write_t; 135 136 typedef enum { 137 CLIST_REG_SOURCE = 1, 138 CLIST_REG_DST 139 } clist_dstsrc; 140 141 /* 142 * Return codes from RDMA operations 143 */ 144 typedef enum { 145 146 RDMA_SUCCESS = 0, /* successful operation */ 147 148 RDMA_INVAL = 1, /* invalid parameter */ 149 RDMA_TIMEDOUT = 2, /* operation timed out */ 150 RDMA_INTR = 3, /* operation interrupted */ 151 RDMA_NORESOURCE = 4, /* insufficient resource */ 152 /* 153 * connection errors 154 */ 155 RDMA_REJECT = 5, /* connection req rejected */ 156 RDMA_NOLISTENER = 6, /* no listener on server */ 157 RDMA_UNREACHABLE = 7, /* host unreachable */ 158 RDMA_CONNLOST = 8, /* connection lost */ 159 160 RDMA_XPRTFAILED = 9, /* RDMA transport failed */ 161 RDMA_PROTECTERR = 10, /* memory protection error */ 162 RDMA_OVERRUN = 11, /* transport overrun */ 163 RDMA_RECVQEMPTY = 12, /* incoming pkt dropped, recv q empty */ 164 RDMA_PROTFAILED = 13, /* RDMA protocol failed */ 165 RDMA_NOTSUPP = 14, /* requested feature not supported */ 166 RDMA_REMOTERR = 15, /* error at remote end */ 167 /* 168 * RDMATF errors 169 */ 170 RDMA_BADVERS = 16, /* mismatch RDMATF versions */ 171 RDMA_REG_EXIST = 17, /* RDMATF registration already exists */ 172 RDMA_HCA_ATTACH = 18, 173 RDMA_HCA_DETACH = 19, 174 175 /* 176 * fallback error 177 */ 178 RDMA_FAILED = 20 /* generic error */ 179 } rdma_stat; 180 181 /* 
182 * Memory region context. This is an RDMA provider generated 183 * handle for a registered arbitrary size contiguous virtual 184 * memory. The RDMA Interface Adapter needs this for local or 185 * remote memory access. 186 * 187 * The mrc_rmr field holds the remote memory region context 188 * which is sent over-the-wire to provide the remote host 189 * with RDMA access to the memory region. 190 */ 191 struct mrc { 192 uint32_t mrc_rmr; /* Remote MR context, sent OTW */ 193 union { 194 struct mr { 195 uint32_t lmr; /* Local MR context */ 196 uint64_t linfo; /* Local memory info */ 197 } mr; 198 } lhdl; 199 }; 200 201 #define mrc_lmr lhdl.mr.lmr 202 #define mrc_linfo lhdl.mr.linfo 203 204 /* 205 * Memory management for the RDMA buffers 206 */ 207 /* 208 * RDMA buffer types 209 */ 210 typedef enum { 211 SEND_BUFFER, /* buf for send msg */ 212 SEND_DESCRIPTOR, /* buf used for send msg descriptor in plugins only */ 213 RECV_BUFFER, /* buf for recv msg */ 214 RECV_DESCRIPTOR, /* buf used for recv msg descriptor in plugins only */ 215 RDMA_LONG_BUFFER /* chunk buf used in RDMATF only and not in plugins */ 216 } rdma_btype; 217 218 /* 219 * RDMA buffer information 220 */ 221 typedef struct rdma_buf { 222 rdma_btype type; /* buffer type */ 223 uint_t len; /* length of buffer */ 224 caddr_t addr; /* buffer address */ 225 struct mrc handle; /* buffer registration handle */ 226 caddr_t rb_private; 227 } rdma_buf_t; 228 229 230 /* 231 * The XDR offset value is used by the XDR 232 * routine to identify the position in the 233 * RPC message where the opaque object would 234 * normally occur. Neither the data content 235 * of the chunk, nor its size field are included 236 * in the RPC message. The XDR offset is calculated 237 * as if the chunks were present. 238 * 239 * The remaining fields identify the chunk of data 240 * on the sender. The c_memhandle identifies a 241 * registered RDMA memory region and the c_addr 242 * and c_len fields identify the chunk within it. 
243 */ 244 struct clist { 245 uint32 c_xdroff; /* XDR offset */ 246 uint32 c_len; /* Length */ 247 clist_dstsrc c_regtype; /* type of registration */ 248 struct mrc c_smemhandle; /* src memory handle */ 249 uint64 c_ssynchandle; /* src sync handle */ 250 union { 251 uint64 c_saddr; /* src address */ 252 caddr_t c_saddr3; 253 } w; 254 struct mrc c_dmemhandle; /* dst memory handle */ 255 uint64 c_dsynchandle; /* dst sync handle */ 256 union { 257 uint64 c_daddr; /* dst address */ 258 caddr_t c_daddr3; 259 } u; 260 struct as *c_adspc; /* address space for saddr/daddr */ 261 rdma_buf_t rb_longbuf; /* used for long requests/replies */ 262 struct clist *c_next; /* Next chunk */ 263 }; 264 265 typedef struct clist clist; 266 267 /* 268 * max 4M wlist xfer size 269 * This is defined because the rfs3_tsize service requires 270 * svc_req struct (which we don't have that in krecv). 271 */ 272 #define MAX_SVC_XFER_SIZE (4*1024*1024) 273 274 enum rdma_proc { 275 RDMA_MSG = 0, /* chunk list and RPC msg follow */ 276 RDMA_NOMSG = 1, /* only chunk list follows */ 277 RDMA_MSGP = 2, /* chunk list and RPC msg with padding follow */ 278 RDMA_DONE = 3 /* signal completion of chunk transfer */ 279 }; 280 281 /* 282 * Listener information for a service 283 */ 284 struct rdma_svc_data { 285 queue_t q; /* queue_t to place incoming pkts */ 286 int active; /* If active, after registeration startup */ 287 rdma_stat err_code; /* Error code from plugin layer */ 288 int32_t svcid; /* RDMA based service identifier */ 289 }; 290 291 /* 292 * Per RDMA plugin module information. 293 * Will be populated by each plugin 294 * module during its initialization. 
295 */ 296 typedef struct rdma_mod { 297 char *rdma_api; /* "kvipl", "ibtf", etc */ 298 uint_t rdma_version; /* RDMATF API version */ 299 int rdma_count; /* # of devices */ 300 struct rdmaops *rdma_ops; /* rdma op vector for api */ 301 } rdma_mod_t; 302 303 /* 304 * Registry of RDMA plugins 305 */ 306 typedef struct rdma_registry { 307 rdma_mod_t *r_mod; /* plugin mod info */ 308 uint32_t r_mod_state; 309 struct rdma_registry *r_next; /* next registered RDMA plugin */ 310 } rdma_registry_t; 311 312 /* 313 * RDMA MODULE state flags (r_mod_state). 314 */ 315 #define RDMA_MOD_ACTIVE 1 316 #define RDMA_MOD_INACTIVE 0 317 318 /* 319 * RDMA transport information 320 */ 321 typedef struct rdma_info { 322 uint_t addrlen; /* address length */ 323 uint_t mts; /* max transfer size */ 324 uint_t mtu; /* native mtu size of unlerlying network */ 325 } rdma_info_t; 326 327 typedef enum { 328 C_IDLE = 0x00000001, 329 C_CONN_PEND = 0x00000002, 330 C_CONNECTED = 0x00000004, 331 C_ERROR_CONN = 0x00000008, 332 C_DISCONN_PEND = 0x00000010, 333 C_REMOTE_DOWN = 0x00000020 334 } conn_c_state; 335 336 /* 337 * RDMA Connection information 338 */ 339 typedef struct conn { 340 rdma_mod_t *c_rdmamod; /* RDMA transport info for conn */ 341 struct netbuf c_raddr; /* remote address */ 342 struct netbuf c_laddr; /* local address */ 343 int c_ref; /* no. 
of clients of connection */ 344 struct conn *c_next; /* next in list of connections */ 345 struct conn *c_prev; /* prev in list of connections */ 346 caddr_t c_private; /* transport specific stuff */ 347 conn_c_state c_state; /* state of connection */ 348 rdma_cc_type_t c_cc_type; /* client or server, for credit cntrl */ 349 union { 350 rdma_clnt_cred_ctrl_t c_clnt_cc; 351 rdma_srv_cred_ctrl_t c_srv_cc; 352 } rdma_conn_cred_ctrl_u; 353 kmutex_t c_lock; /* protect c_state and c_ref fields */ 354 kcondvar_t c_cv; /* to signal when pending is done */ 355 } CONN; 356 357 358 /* 359 * Data transferred from plugin interrupt to svc_queuereq() 360 */ 361 typedef struct rdma_recv_data { 362 CONN *conn; 363 int status; 364 rdma_buf_t rpcmsg; 365 } rdma_recv_data_t; 366 367 /* structure used to pass information for READ over rdma write */ 368 typedef enum { 369 RCI_WRITE_UIO_CHUNK = 1, 370 RCI_WRITE_ADDR_CHUNK = 2, 371 RCI_REPLY_CHUNK = 3 372 } rci_type_t; 373 374 typedef struct { 375 rci_type_t rci_type; 376 union { 377 struct uio *rci_uiop; 378 caddr_t rci_addr; 379 } rci_a; 380 uint32 rci_len; 381 struct clist **rci_clpp; /* point to write chunk list in readargs */ 382 } rdma_chunkinfo_t; 383 384 typedef struct { 385 uint_t rcil_len; 386 uint_t rcil_len_alt; 387 } rdma_chunkinfo_lengths_t; 388 389 typedef struct { 390 struct clist *rwci_wlist; 391 CONN *rwci_conn; 392 } rdma_wlist_conn_info_t; 393 394 /* 395 * Operations vector for RDMA transports. 
396 */ 397 typedef struct rdmaops { 398 /* Network */ 399 rdma_stat (*rdma_reachable)(int addr_type, struct netbuf *, 400 void **handle); 401 /* Connection */ 402 rdma_stat (*rdma_get_conn)(struct netbuf *, int addr_type, 403 void *, CONN **); 404 rdma_stat (*rdma_rel_conn)(CONN *); 405 /* Server side listner start and stop routines */ 406 void (*rdma_svc_listen)(struct rdma_svc_data *); 407 void (*rdma_svc_stop)(struct rdma_svc_data *); 408 /* Memory */ 409 rdma_stat (*rdma_regmem)(CONN *, caddr_t, caddr_t, 410 uint_t, struct mrc *); 411 rdma_stat (*rdma_deregmem)(CONN *, caddr_t, struct mrc); 412 rdma_stat (*rdma_regmemsync)(CONN *, caddr_t, caddr_t, uint_t, 413 struct mrc *, void **, void *); 414 rdma_stat (*rdma_deregmemsync)(CONN *, caddr_t, struct mrc, 415 void *, void *); 416 rdma_stat (*rdma_syncmem)(CONN *, void *, caddr_t, int, int); 417 /* Buffer */ 418 rdma_stat (*rdma_buf_alloc)(CONN *, rdma_buf_t *); 419 void (*rdma_buf_free)(CONN *, rdma_buf_t *); 420 /* Transfer */ 421 rdma_stat (*rdma_send)(CONN *, clist *, uint32_t); 422 rdma_stat (*rdma_send_resp)(CONN *, clist *, uint32_t); 423 rdma_stat (*rdma_clnt_recvbuf)(CONN *, clist *, uint32_t); 424 rdma_stat (*rdma_clnt_recvbuf_remove)(CONN *, uint32_t); 425 rdma_stat (*rdma_svc_recvbuf)(CONN *, clist *); 426 rdma_stat (*rdma_recv)(CONN *, clist **, uint32_t); 427 /* RDMA */ 428 rdma_stat (*rdma_read)(CONN *, clist *, int); 429 rdma_stat (*rdma_write)(CONN *, clist *, int); 430 /* INFO */ 431 rdma_stat (*rdma_getinfo)(rdma_info_t *info); 432 } rdmaops_t; 433 434 typedef struct rdma_svc_wait { 435 kmutex_t svc_lock; 436 kcondvar_t svc_cv; 437 rdma_stat svc_stat; 438 } rdma_svc_wait_t; 439 440 extern rdma_svc_wait_t rdma_wait; 441 442 /* 443 * RDMA operations. 
444 */ 445 #define RDMA_REACHABLE(rdma_ops, addr_type, addr, handle) \ 446 (*(rdma_ops)->rdma_reachable)(addr_type, addr, handle) 447 448 #define RDMA_GET_CONN(rdma_ops, addr, addr_type, handle, conn) \ 449 (*(rdma_ops)->rdma_get_conn)(addr, addr_type, handle, conn) 450 451 #define RDMA_REL_CONN(conn) \ 452 (*(conn)->c_rdmamod->rdma_ops->rdma_rel_conn)(conn) 453 454 #define RDMA_REGMEM(conn, adsp, buff, len, handle) \ 455 (*(conn)->c_rdmamod->rdma_ops->rdma_regmem)(conn, adsp, \ 456 buff, len, handle) 457 458 #define RDMA_DEREGMEM(conn, buff, handle) \ 459 (*(conn)->c_rdmamod->rdma_ops->rdma_deregmem)(conn, buff, handle) 460 461 #define RDMA_REGMEMSYNC(conn, adsp, buff, len, handle, synchandle, lrc) \ 462 (*(conn)->c_rdmamod->rdma_ops->rdma_regmemsync)(conn, adsp, buff, \ 463 len, handle, synchandle, lrc) 464 465 #define RDMA_DEREGMEMSYNC(conn, buff, handle, synchandle, lrc) \ 466 (*(conn)->c_rdmamod->rdma_ops->rdma_deregmemsync)(conn, buff, \ 467 handle, synchandle, lrc) 468 469 #define RDMA_SYNCMEM(conn, handle, buff, len, direction) \ 470 (*(conn)->c_rdmamod->rdma_ops->rdma_syncmem)(conn, handle, \ 471 buff, len, direction) 472 473 #define RDMA_BUF_ALLOC(conn, rbuf) \ 474 (*(conn)->c_rdmamod->rdma_ops->rdma_buf_alloc)(conn, rbuf) 475 476 #define RDMA_BUF_FREE(conn, rbuf) \ 477 (*(conn)->c_rdmamod->rdma_ops->rdma_buf_free)(conn, rbuf) 478 479 #define RDMA_SEND(conn, sendlist, xid) \ 480 (*(conn)->c_rdmamod->rdma_ops->rdma_send)(conn, sendlist, xid) 481 482 #define RDMA_SEND_RESP(conn, sendlist, xid) \ 483 (*(conn)->c_rdmamod->rdma_ops->rdma_send_resp)(conn, sendlist, xid) 484 485 #define RDMA_CLNT_RECVBUF(conn, cl, xid) \ 486 (*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf)(conn, cl, xid) 487 488 #define RDMA_CLNT_RECVBUF_REMOVE(conn, xid) \ 489 (*(conn)->c_rdmamod->rdma_ops->rdma_clnt_recvbuf_remove)(conn, xid) 490 491 #define RDMA_SVC_RECVBUF(conn, cl) \ 492 (*(conn)->c_rdmamod->rdma_ops->rdma_svc_recvbuf)(conn, cl) 493 494 #define RDMA_RECV(conn, recvlist, 
xid) \ 495 (*(conn)->c_rdmamod->rdma_ops->rdma_recv)(conn, recvlist, xid) 496 497 #define RDMA_READ(conn, cl, wait) \ 498 (*(conn)->c_rdmamod->rdma_ops->rdma_read)(conn, cl, wait) 499 500 #define RDMA_WRITE(conn, cl, wait) \ 501 (*(conn)->c_rdmamod->rdma_ops->rdma_write)(conn, cl, wait) 502 503 #define RDMA_GETINFO(rdma_mod, info) \ 504 (*(rdma_mod)->rdma_ops->rdma_getinfo)(info) 505 506 #ifdef _KERNEL 507 extern rdma_registry_t *rdma_mod_head; 508 extern krwlock_t rdma_lock; /* protects rdma_mod_head list */ 509 extern int rdma_modloaded; /* flag for loading RDMA plugins */ 510 extern int rdma_dev_available; /* rdma device is loaded or not */ 511 extern kmutex_t rdma_modload_lock; /* protects rdma_modloaded flag */ 512 extern uint_t rdma_minchunk; 513 extern ldi_ident_t rpcmod_li; /* needed by layed driver framework */ 514 515 /* 516 * General RDMA routines 517 */ 518 extern struct clist *clist_alloc(void); 519 extern void clist_add(struct clist **, uint32_t, int, 520 struct mrc *, caddr_t, struct mrc *, caddr_t); 521 extern void clist_free(struct clist *); 522 extern uint32_t clist_len(struct clist *); 523 extern void clist_zero_len(struct clist *); 524 extern rdma_stat clist_register(CONN *conn, struct clist *cl, clist_dstsrc); 525 extern rdma_stat clist_deregister(CONN *conn, struct clist *cl); 526 extern rdma_stat clist_syncmem(CONN *conn, struct clist *cl, clist_dstsrc); 527 extern rdma_stat rdma_clnt_postrecv(CONN *conn, uint32_t xid); 528 extern rdma_stat rdma_clnt_postrecv_remove(CONN *conn, uint32_t xid); 529 extern rdma_stat rdma_svc_postrecv(CONN *conn); 530 extern rdma_stat rdma_register_mod(rdma_mod_t *mod); 531 extern rdma_stat rdma_unregister_mod(rdma_mod_t *mod); 532 extern rdma_stat rdma_buf_alloc(CONN *, rdma_buf_t *); 533 extern void rdma_buf_free(CONN *, rdma_buf_t *); 534 extern int rdma_modload(); 535 extern bool_t rdma_get_wchunk(struct svc_req *, iovec_t *, struct clist *); 536 extern rdma_stat rdma_kwait(void); 537 extern int 
rdma_setup_read_chunks(struct clist *, uint32_t, int *); 538 539 /* 540 * RDMA XDR 541 */ 542 extern void xdrrdma_create(XDR *, caddr_t, uint_t, int, struct clist *, 543 enum xdr_op, CONN *); 544 extern void xdrrdma_destroy(XDR *); 545 546 extern uint_t xdrrdma_getpos(XDR *); 547 extern bool_t xdrrdma_setpos(XDR *, uint_t); 548 extern bool_t xdr_clist(XDR *, clist *); 549 extern bool_t xdr_do_clist(XDR *, clist **); 550 extern uint_t xdr_getbufsize(XDR *); 551 extern unsigned int xdrrdma_sizeof(xdrproc_t, void *, int, uint_t *, uint_t *); 552 extern unsigned int xdrrdma_authsize(AUTH *, struct cred *, int); 553 554 extern void xdrrdma_store_wlist(XDR *, struct clist *); 555 extern struct clist *xdrrdma_wclist(XDR *); 556 extern bool_t xdr_decode_reply_wchunk(XDR *, struct clist **); 557 extern bool_t xdr_decode_wlist(XDR *xdrs, struct clist **, bool_t *); 558 extern bool_t xdr_decode_wlist_svc(XDR *xdrs, struct clist **, bool_t *, 559 uint32_t *, CONN *); 560 extern bool_t xdr_encode_rlist_svc(XDR *, clist *); 561 extern bool_t xdr_encode_wlist(XDR *, clist *); 562 extern bool_t xdr_encode_reply_wchunk(XDR *, struct clist *, 563 uint32_t seg_array_len); 564 bool_t xdrrdma_getrdmablk(XDR *, struct clist **, uint_t *, 565 CONN **conn, const uint_t); 566 bool_t xdrrdma_read_from_client(struct clist *, CONN **, uint_t); 567 bool_t xdrrdma_send_read_data(XDR *, uint_t, struct clist *); 568 bool_t xdrrdma_free_clist(CONN *, struct clist *); 569 #endif /* _KERNEL */ 570 571 #ifdef __cplusplus 572 } 573 #endif 574 575 #endif /* _RPC_RPC_RDMA_H */ 576