1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ 27 /* All Rights Reserved */ 28 /* 29 * Portions of this source code were derived from Berkeley 30 * 4.3 BSD under license from the Regents of the University of 31 * California. 32 */ 33 34 #pragma ident "%Z%%M% %I% %E% SMI" 35 36 #include <sys/param.h> 37 #include <sys/types.h> 38 #include <sys/user.h> 39 #include <sys/systm.h> 40 #include <sys/sysmacros.h> 41 #include <sys/errno.h> 42 #include <sys/kmem.h> 43 #include <sys/debug.h> 44 #include <sys/systm.h> 45 #include <sys/kstat.h> 46 #include <sys/t_lock.h> 47 #include <sys/ddi.h> 48 #include <sys/cmn_err.h> 49 #include <sys/time.h> 50 #include <sys/isa_defs.h> 51 #include <sys/zone.h> 52 53 #include <rpc/types.h> 54 #include <rpc/xdr.h> 55 #include <rpc/auth.h> 56 #include <rpc/clnt.h> 57 #include <rpc/rpc_msg.h> 58 #include <rpc/rpc_rdma.h> 59 60 61 static enum clnt_stat clnt_rdma_kcallit(CLIENT *, rpcproc_t, xdrproc_t, 62 caddr_t, xdrproc_t, caddr_t, struct timeval); 63 static void clnt_rdma_kabort(CLIENT *); 64 static void clnt_rdma_kerror(CLIENT *, struct rpc_err *); 65 static bool_t clnt_rdma_kfreeres(CLIENT *, xdrproc_t, caddr_t); 66 static void clnt_rdma_kdestroy(CLIENT *); 67 static bool_t clnt_rdma_kcontrol(CLIENT *, int, char *); 68 static int clnt_rdma_ksettimers(CLIENT *, struct rpc_timers *, 69 struct rpc_timers *, int, void(*)(int, int, caddr_t), caddr_t, uint32_t); 70 71 /* 72 * Operations vector for RDMA based RPC 73 */ 74 static struct clnt_ops rdma_clnt_ops = { 75 clnt_rdma_kcallit, /* do rpc call */ 76 clnt_rdma_kabort, /* abort call */ 77 clnt_rdma_kerror, /* return error status */ 78 clnt_rdma_kfreeres, /* free results */ 79 clnt_rdma_kdestroy, /* destroy rpc handle */ 80 clnt_rdma_kcontrol, /* the ioctl() of rpc */ 81 clnt_rdma_ksettimers, /* set retry timers */ 82 }; 83 84 /* 85 * The size of the preserialized RPC header information. 86 */ 87 #define CKU_HDRSIZE 20 88 89 /* 90 * Per RPC RDMA endpoint details 91 */ 92 typedef struct cku_private { 93 CLIENT cku_client; /* client handle */ 94 rdma_mod_t *cku_rd_mod; /* underlying RDMA mod */ 95 void *cku_rd_handle; /* underlying RDMA device */ 96 struct netbuf cku_addr; /* remote netbuf address */ 97 int cku_addrfmly; /* for finding addr_type */ 98 struct rpc_err cku_err; /* error status */ 99 struct cred *cku_cred; /* credentials */ 100 XDR cku_outxdr; /* xdr stream for output */ 101 uint32_t cku_outsz; 102 XDR cku_inxdr; /* xdr stream for input */ 103 char cku_rpchdr[CKU_HDRSIZE+4]; /* rpc header */ 104 uint32_t cku_xid; /* current XID */ 105 } cku_private_t; 106 107 #define CLNT_RDMA_DELAY 10 /* secs to delay after a connection failure */ 108 static int clnt_rdma_min_delay = CLNT_RDMA_DELAY; 109 110 struct { 111 kstat_named_t rccalls; 112 kstat_named_t rcbadcalls; 113 kstat_named_t rcbadxids; 114 kstat_named_t rctimeouts; 115 kstat_named_t rcnewcreds; 116 kstat_named_t rcbadverfs; 117 kstat_named_t rctimers; 118 kstat_named_t rccantconn; 119 kstat_named_t rcnomem; 120 kstat_named_t rcintrs; 121 kstat_named_t rclongrpcs; 122 } rdmarcstat = { 123 { "calls", KSTAT_DATA_UINT64 }, 124 { "badcalls", KSTAT_DATA_UINT64 }, 125 { "badxids", KSTAT_DATA_UINT64 }, 126 { "timeouts", KSTAT_DATA_UINT64 }, 127 { "newcreds", KSTAT_DATA_UINT64 }, 128 { "badverfs", KSTAT_DATA_UINT64 }, 129 { "timers", KSTAT_DATA_UINT64 }, 130 { "cantconn", KSTAT_DATA_UINT64 }, 131 { "nomem", KSTAT_DATA_UINT64 }, 132 { "interrupts", KSTAT_DATA_UINT64 }, 133 { "longrpc", KSTAT_DATA_UINT64 } 134 }; 135 136 kstat_named_t *rdmarcstat_ptr = (kstat_named_t *)&rdmarcstat; 137 uint_t rdmarcstat_ndata = sizeof (rdmarcstat) / sizeof (kstat_named_t); 138 139 #ifdef DEBUG 140 int rdma_clnt_debug = 0; 141 #endif 142 143 #ifdef accurate_stats 144 extern kmutex_t rdmarcstat_lock; /* mutex for rcstat updates */ 145 146 #define RCSTAT_INCR(x) \ 147 mutex_enter(&rdmarcstat_lock); \ 148 rdmarcstat.x.value.ui64++; \ 149 mutex_exit(&rdmarcstat_lock); 150 #else 151 #define RCSTAT_INCR(x) \ 152 rdmarcstat.x.value.ui64++; 153 #endif 154 155 #define ptoh(p) (&((p)->cku_client)) 156 #define htop(h) ((cku_private_t *)((h)->cl_private)) 157 158 int 159 clnt_rdma_kcreate(char *proto, void *handle, struct netbuf *raddr, int family, 160 rpcprog_t pgm, rpcvers_t vers, struct cred *cred, CLIENT **cl) 161 { 162 CLIENT *h; 163 struct cku_private *p; 164 struct rpc_msg call_msg; 165 rdma_registry_t *rp; 166 167 ASSERT(INGLOBALZONE(curproc)); 168 169 if (cl == NULL) 170 return (EINVAL); 171 *cl = NULL; 172 173 p = kmem_zalloc(sizeof (*p), KM_SLEEP); 174 175 /* 176 * Find underlying RDMATF plugin 177 */ 178 rw_enter(&rdma_lock, RW_READER); 179 rp = rdma_mod_head; 180 while (rp != NULL) { 181 if (strcmp(rp->r_mod->rdma_api, proto)) 182 rp = rp->r_next; 183 else { 184 p->cku_rd_mod = rp->r_mod; 185 p->cku_rd_handle = handle; 186 break; 187 } 188 } 189 rw_exit(&rdma_lock); 190 191 if (p->cku_rd_mod == NULL) { 192 /* 193 * Should not happen. 194 * No matching RDMATF plugin. 195 */ 196 kmem_free(p, sizeof (struct cku_private)); 197 return (EINVAL); 198 } 199 200 h = ptoh(p); 201 h->cl_ops = &rdma_clnt_ops; 202 h->cl_private = (caddr_t)p; 203 h->cl_auth = authkern_create(); 204 205 /* call message, just used to pre-serialize below */ 206 call_msg.rm_xid = 0; 207 call_msg.rm_direction = CALL; 208 call_msg.rm_call.cb_rpcvers = RPC_MSG_VERSION; 209 call_msg.rm_call.cb_prog = pgm; 210 call_msg.rm_call.cb_vers = vers; 211 212 xdrmem_create(&p->cku_outxdr, p->cku_rpchdr, CKU_HDRSIZE, XDR_ENCODE); 213 /* pre-serialize call message header */ 214 if (!xdr_callhdr(&p->cku_outxdr, &call_msg)) { 215 XDR_DESTROY(&p->cku_outxdr); 216 auth_destroy(h->cl_auth); 217 kmem_free(p, sizeof (struct cku_private)); 218 return (EINVAL); 219 } 220 221 /* 222 * Set up the rpc information 223 */ 224 p->cku_cred = cred; 225 p->cku_addr.buf = kmem_zalloc(raddr->maxlen, KM_SLEEP); 226 p->cku_addr.maxlen = raddr->maxlen; 227 p->cku_addr.len = raddr->len; 228 bcopy(raddr->buf, p->cku_addr.buf, raddr->len); 229 p->cku_addrfmly = family; 230 231 *cl = h; 232 return (0); 233 } 234 235 static void 236 clnt_rdma_kdestroy(CLIENT *h) 237 { 238 struct cku_private *p = htop(h); 239 240 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 241 kmem_free(p, sizeof (*p)); 242 } 243 244 void 245 clnt_rdma_kinit(CLIENT *h, char *proto, void *handle, struct netbuf *raddr, 246 struct cred *cred) 247 { 248 struct cku_private *p = htop(h); 249 rdma_registry_t *rp; 250 251 ASSERT(INGLOBALZONE(curproc)); 252 /* 253 * Find underlying RDMATF plugin 254 */ 255 p->cku_rd_mod = NULL; 256 rw_enter(&rdma_lock, RW_READER); 257 rp = rdma_mod_head; 258 while (rp != NULL) { 259 if (strcmp(rp->r_mod->rdma_api, proto)) 260 rp = rp->r_next; 261 else { 262 p->cku_rd_mod = rp->r_mod; 263 p->cku_rd_handle = handle; 264 break; 265 } 266 267 } 268 rw_exit(&rdma_lock); 269 270 /* 271 * Set up the rpc information 272 */ 273 p->cku_cred = cred; 274 p->cku_xid = 0; 275 276 if (p->cku_addr.maxlen < raddr->len) { 277 if (p->cku_addr.maxlen != 0 && p->cku_addr.buf != NULL) 278 kmem_free(p->cku_addr.buf, p->cku_addr.maxlen); 279 p->cku_addr.buf = kmem_zalloc(raddr->maxlen, KM_SLEEP); 280 p->cku_addr.maxlen = raddr->maxlen; 281 } 282 283 p->cku_addr.len = raddr->len; 284 bcopy(raddr->buf, p->cku_addr.buf, raddr->len); 285 h->cl_ops = &rdma_clnt_ops; 286 } 287 288 /* ARGSUSED */ 289 static enum clnt_stat 290 clnt_rdma_kcallit(CLIENT *h, rpcproc_t procnum, xdrproc_t xdr_args, 291 caddr_t argsp, xdrproc_t xdr_results, caddr_t resultsp, struct timeval wait) 292 { 293 cku_private_t *p = htop(h); 294 int status; 295 XDR *xdrs; 296 XDR *cxdrp = NULL, callxdr; /* for xdrrdma encoding the RPC call */ 297 XDR *rxdrp = NULL, replxdr; /* for xdrrdma decoding the RPC reply */ 298 struct rpc_msg reply_msg; 299 struct clist *sendlist, *recvlist = NULL; 300 struct clist *cl = NULL, *cle = NULL; 301 uint_t vers, op; 302 uint_t off; 303 uint32_t xid; 304 CONN *conn = NULL; 305 rdma_buf_t clmsg, rpcmsg, longmsg, rpcreply; 306 int msglen; 307 clock_t ticks; 308 309 RCSTAT_INCR(rccalls); 310 /* 311 * Get unique xid 312 */ 313 if (p->cku_xid == 0) 314 p->cku_xid = alloc_xid(); 315 316 status = RDMA_GET_CONN(p->cku_rd_mod->rdma_ops, &p->cku_addr, 317 p->cku_addrfmly, p->cku_rd_handle, &conn); 318 319 if (conn == NULL) { 320 /* 321 * Connect failed to server. Could be because of one 322 * of several things. In some cases we don't want 323 * the caller to retry immediately - delay before 324 * returning to caller. 325 */ 326 switch (status) { 327 case RDMA_TIMEDOUT: 328 /* 329 * Already timed out. No need to delay 330 * some more. 331 */ 332 p->cku_err.re_status = RPC_TIMEDOUT; 333 p->cku_err.re_errno = ETIMEDOUT; 334 break; 335 case RDMA_INTR: 336 /* 337 * Failed because of an signal. Very likely 338 * the caller will not retry. 339 */ 340 p->cku_err.re_status = RPC_INTR; 341 p->cku_err.re_errno = EINTR; 342 break; 343 default: 344 /* 345 * All other failures - server down or service 346 * down or temporary resource failure. Delay before 347 * returning to caller. 348 */ 349 ticks = clnt_rdma_min_delay * drv_usectohz(1000000); 350 p->cku_err.re_status = RPC_CANTCONNECT; 351 p->cku_err.re_errno = EIO; 352 353 if (h->cl_nosignal == TRUE) { 354 delay(ticks); 355 } else { 356 if (delay_sig(ticks) == EINTR) { 357 p->cku_err.re_status = RPC_INTR; 358 p->cku_err.re_errno = EINTR; 359 } 360 } 361 break; 362 } 363 364 return (p->cku_err.re_status); 365 } 366 /* 367 * Get the size of the rpc call message. Need this 368 * to determine if the rpc call message will fit in 369 * the pre-allocated RDMA buffers. If the rpc call 370 * message length is greater that the pre-allocated 371 * buffers then, it is a Long RPC. A one time use 372 * buffer is allocated and registered for the Long 373 * RPC call. 374 */ 375 xdrs = &callxdr; 376 msglen = CKU_HDRSIZE + BYTES_PER_XDR_UNIT; 377 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { 378 msglen += xdrrdma_authsize(h->cl_auth, p->cku_cred, 379 rdma_minchunk); 380 msglen += xdrrdma_sizeof(xdr_args, argsp, rdma_minchunk); 381 382 if (msglen > RPC_MSG_SZ) { 383 384 /* 385 * Long RPC. Allocate one time use custom buffer. 386 */ 387 rpcmsg.type = CHUNK_BUFFER; 388 rpcmsg.addr = kmem_zalloc(msglen, KM_SLEEP); 389 cle = kmem_zalloc(sizeof (*cle), KM_SLEEP); 390 cle->c_xdroff = 0; 391 cle->c_len = rpcmsg.len = msglen; 392 cle->c_saddr = (uint64)(uintptr_t)rpcmsg.addr; 393 cle->c_next = NULL; 394 xdrrdma_create(xdrs, rpcmsg.addr, msglen, 395 rdma_minchunk, cle, XDR_ENCODE, NULL); 396 cxdrp = xdrs; 397 op = RDMA_NOMSG; 398 } else { 399 /* 400 * Get a pre-allocated buffer for rpc call 401 */ 402 rpcmsg.type = SEND_BUFFER; 403 if (RDMA_BUF_ALLOC(conn, &rpcmsg)) { 404 p->cku_err.re_status = RPC_CANTSEND; 405 p->cku_err.re_errno = EIO; 406 RCSTAT_INCR(rcnomem); 407 cmn_err(CE_WARN, 408 "clnt_rdma_kcallit: no buffers!"); 409 goto done; 410 } 411 xdrrdma_create(xdrs, rpcmsg.addr, rpcmsg.len, 412 rdma_minchunk, NULL, XDR_ENCODE, NULL); 413 cxdrp = xdrs; 414 op = RDMA_MSG; 415 } 416 } else { 417 /* 418 * For RPCSEC_GSS since we cannot accurately presize the 419 * buffer required for encoding, we assume that its going 420 * to be a Long RPC to start with. We also create the 421 * the XDR stream with min_chunk set to 0 which instructs 422 * the XDR layer to not chunk the incoming byte stream. 423 */ 424 425 msglen += 2 * MAX_AUTH_BYTES + 2 * sizeof (struct opaque_auth); 426 msglen += xdr_sizeof(xdr_args, argsp); 427 428 /* 429 * Long RPC. Allocate one time use custom buffer. 430 */ 431 longmsg.type = CHUNK_BUFFER; 432 longmsg.addr = kmem_zalloc(msglen, KM_SLEEP); 433 cle = kmem_zalloc(sizeof (*cle), KM_SLEEP); 434 cle->c_xdroff = 0; 435 cle->c_len = longmsg.len = msglen; 436 cle->c_saddr = (uint64)(uintptr_t)longmsg.addr; 437 cle->c_next = NULL; 438 xdrrdma_create(xdrs, longmsg.addr, msglen, 0, cle, 439 XDR_ENCODE, NULL); 440 cxdrp = xdrs; 441 op = RDMA_NOMSG; 442 } 443 444 if (h->cl_auth->ah_cred.oa_flavor != RPCSEC_GSS) { 445 /* 446 * Copy in the preserialized RPC header 447 * information. 448 */ 449 bcopy(p->cku_rpchdr, rpcmsg.addr, CKU_HDRSIZE); 450 451 /* 452 * transaction id is the 1st thing in the output 453 * buffer. 454 */ 455 /* LINTED pointer alignment */ 456 (*(uint32_t *)(rpcmsg.addr)) = p->cku_xid; 457 458 /* Skip the preserialized stuff. */ 459 XDR_SETPOS(xdrs, CKU_HDRSIZE); 460 461 /* Serialize dynamic stuff into the output buffer. */ 462 if ((!XDR_PUTINT32(xdrs, (int32_t *)&procnum)) || 463 (!AUTH_MARSHALL(h->cl_auth, xdrs, p->cku_cred)) || 464 (!(*xdr_args)(xdrs, argsp))) { 465 rdma_buf_free(conn, &rpcmsg); 466 if (cle) 467 clist_free(cle); 468 p->cku_err.re_status = RPC_CANTENCODEARGS; 469 p->cku_err.re_errno = EIO; 470 cmn_err(CE_WARN, 471 "clnt_rdma_kcallit: XDR_PUTINT32/AUTH_MARSHAL/xdr_args failed"); 472 goto done; 473 } 474 p->cku_outsz = XDR_GETPOS(xdrs); 475 } else { 476 uint32_t *uproc = (uint32_t *)&p->cku_rpchdr[CKU_HDRSIZE]; 477 IXDR_PUT_U_INT32(uproc, procnum); 478 (*(uint32_t *)(&p->cku_rpchdr[0])) = p->cku_xid; 479 XDR_SETPOS(xdrs, 0); 480 481 /* Serialize the procedure number and the arguments. */ 482 if (!AUTH_WRAP(h->cl_auth, (caddr_t)p->cku_rpchdr, 483 CKU_HDRSIZE+4, xdrs, xdr_args, argsp)) { 484 if (longmsg.addr != xdrs->x_base) { 485 longmsg.addr = xdrs->x_base; 486 longmsg.len = xdr_getbufsize(xdrs); 487 } 488 rdma_buf_free(conn, &longmsg); 489 clist_free(cle); 490 p->cku_err.re_status = RPC_CANTENCODEARGS; 491 p->cku_err.re_errno = EIO; 492 cmn_err(CE_WARN, 493 "clnt_rdma_kcallit: AUTH_WRAP failed"); 494 goto done; 495 } 496 /* 497 * If we had to allocate a new buffer while encoding 498 * then update the addr and len. 499 */ 500 if (longmsg.addr != xdrs->x_base) { 501 longmsg.addr = xdrs->x_base; 502 longmsg.len = xdr_getbufsize(xdrs); 503 } 504 505 /* 506 * If it so happens that the encoded message is after all 507 * not long enough to be a Long RPC then allocate a 508 * SEND_BUFFER and copy the encoded message into it. 509 */ 510 p->cku_outsz = XDR_GETPOS(xdrs); 511 if (p->cku_outsz > RPC_MSG_SZ) { 512 rpcmsg.type = CHUNK_BUFFER; 513 rpcmsg.addr = longmsg.addr; 514 rpcmsg.len = longmsg.len; 515 } else { 516 clist_free(cle); 517 XDR_DESTROY(cxdrp); 518 cxdrp = NULL; 519 /* 520 * Get a pre-allocated buffer for rpc call 521 */ 522 rpcmsg.type = SEND_BUFFER; 523 if (RDMA_BUF_ALLOC(conn, &rpcmsg)) { 524 p->cku_err.re_status = RPC_CANTSEND; 525 p->cku_err.re_errno = EIO; 526 RCSTAT_INCR(rcnomem); 527 cmn_err(CE_WARN, 528 "clnt_rdma_kcallit: no buffers!"); 529 rdma_buf_free(conn, &longmsg); 530 goto done; 531 } 532 bcopy(longmsg.addr, rpcmsg.addr, p->cku_outsz); 533 xdrrdma_create(xdrs, rpcmsg.addr, p->cku_outsz, 0, 534 NULL, XDR_ENCODE, NULL); 535 cxdrp = xdrs; 536 rdma_buf_free(conn, &longmsg); 537 op = RDMA_MSG; 538 } 539 } 540 541 cl = xdrrdma_clist(xdrs); 542 543 /* 544 * Update the chunk size information for the Long RPC msg. 545 */ 546 if (cl && op == RDMA_NOMSG) 547 cl->c_len = p->cku_outsz; 548 549 /* 550 * Set up the RDMA chunk message 551 */ 552 vers = RPCRDMA_VERS; 553 clmsg.type = SEND_BUFFER; 554 if (RDMA_BUF_ALLOC(conn, &clmsg)) { 555 p->cku_err.re_status = RPC_CANTSEND; 556 p->cku_err.re_errno = EIO; 557 rdma_buf_free(conn, &rpcmsg); 558 RCSTAT_INCR(rcnomem); 559 cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffers!!"); 560 goto done; 561 } 562 xdrs = &p->cku_outxdr; 563 xdrmem_create(xdrs, clmsg.addr, clmsg.len, XDR_ENCODE); 564 /* 565 * Treat xid as opaque (xid is the first entity 566 * in the rpc rdma message). 567 */ 568 (*(uint32_t *)clmsg.addr) = p->cku_xid; 569 /* Skip xid and set the xdr position accordingly. */ 570 XDR_SETPOS(xdrs, sizeof (uint32_t)); 571 (void) xdr_u_int(xdrs, &vers); 572 (void) xdr_u_int(xdrs, &op); 573 574 /* 575 * Now XDR the chunk list 576 */ 577 if (cl != NULL) { 578 579 /* 580 * Register the chunks in the list 581 */ 582 status = clist_register(conn, cl, 1); 583 if (status != RDMA_SUCCESS) { 584 cmn_err(CE_WARN, 585 "clnt_rdma_kcallit: clist register failed"); 586 rdma_buf_free(conn, &clmsg); 587 rdma_buf_free(conn, &rpcmsg); 588 clist_free(cl); 589 p->cku_err.re_status = RPC_CANTSEND; 590 p->cku_err.re_errno = EIO; 591 goto done; 592 } 593 594 } 595 (void) xdr_do_clist(xdrs, &cl); 596 597 /* 598 * Start with the RDMA header and clist (if any) 599 */ 600 sendlist = NULL; 601 clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &clmsg.handle, 602 clmsg.addr, NULL, NULL); 603 604 /* 605 * Put the RPC call message in the send list if small RPC 606 */ 607 if (op == RDMA_MSG) { 608 clist_add(&sendlist, 0, p->cku_outsz, &rpcmsg.handle, 609 rpcmsg.addr, NULL, NULL); 610 } else { 611 /* Long RPC already in chunk list */ 612 RCSTAT_INCR(rclongrpcs); 613 } 614 615 /* 616 * Set up a reply buffer ready for the reply 617 */ 618 status = rdma_clnt_postrecv(conn, p->cku_xid); 619 if (status != RDMA_SUCCESS) { 620 rdma_buf_free(conn, &clmsg); 621 rdma_buf_free(conn, &rpcmsg); 622 if (cl) { 623 (void) clist_deregister(conn, cl, 1); 624 clist_free(cl); 625 } 626 clist_free(sendlist); 627 p->cku_err.re_status = RPC_CANTSEND; 628 p->cku_err.re_errno = EIO; 629 goto done; 630 } 631 /* 632 * sync the memory for dma 633 */ 634 if (cl != NULL) { 635 status = clist_syncmem(conn, cl, 1); 636 if (status != RDMA_SUCCESS) { 637 rdma_buf_free(conn, &clmsg); 638 rdma_buf_free(conn, &rpcmsg); 639 (void) clist_deregister(conn, cl, 1); 640 clist_free(cl); 641 clist_free(sendlist); 642 p->cku_err.re_status = RPC_CANTSEND; 643 p->cku_err.re_errno = EIO; 644 goto done; 645 } 646 } 647 648 /* 649 * Send the call message to the server 650 */ 651 status = RDMA_SEND(conn, sendlist, p->cku_xid); 652 if (status != RDMA_SUCCESS) { 653 if (cl) { 654 (void) clist_deregister(conn, cl, 1); 655 clist_free(cl); 656 /* 657 * If this was a long RPC message, need 658 * to free that buffer. 659 */ 660 if (rpcmsg.type == CHUNK_BUFFER) 661 rdma_buf_free(conn, &rpcmsg); 662 } 663 clist_free(sendlist); 664 p->cku_err.re_status = RPC_CANTSEND; 665 p->cku_err.re_errno = EIO; 666 goto done; 667 } else { 668 /* 669 * RDMA plugin now owns the send msg buffers. 670 * Clear them out and don't free them here. 671 */ 672 clmsg.addr = NULL; 673 if (rpcmsg.type == SEND_BUFFER) 674 rpcmsg.addr = NULL; 675 } 676 clist_free(sendlist); 677 #ifdef DEBUG 678 if (rdma_clnt_debug) { 679 printf("clnt_rdma_kcallit: send request xid %u\n", p->cku_xid); 680 } 681 #endif 682 683 /* 684 * Recv rpc reply 685 */ 686 status = RDMA_RECV(conn, &recvlist, p->cku_xid); 687 688 /* 689 * Deregister chunks sent. Do this only after the reply 690 * is received as that is a sure indication that the 691 * remote end has completed RDMA of the chunks. 692 */ 693 if (cl != NULL) { 694 /* 695 * Deregister the chunks 696 */ 697 (void) clist_deregister(conn, cl, 1); 698 clist_free(cl); 699 /* 700 * If long RPC free chunk 701 */ 702 rdma_buf_free(conn, &rpcmsg); 703 } 704 705 /* 706 * Now check recv status 707 */ 708 if (status != 0) { 709 #ifdef DEBUG 710 if (rdma_clnt_debug) 711 cmn_err(CE_NOTE, 712 "clnt_rdma_kcallit: reply failed %u status %d", 713 p->cku_xid, status); 714 #endif 715 if (status == RDMA_INTR) { 716 p->cku_err.re_status = RPC_INTR; 717 p->cku_err.re_errno = EINTR; 718 RCSTAT_INCR(rcintrs); 719 } else if (status == RPC_TIMEDOUT) { 720 p->cku_err.re_status = RPC_TIMEDOUT; 721 p->cku_err.re_errno = ETIMEDOUT; 722 RCSTAT_INCR(rctimeouts); 723 } else { 724 p->cku_err.re_status = RPC_CANTRECV; 725 p->cku_err.re_errno = EIO; 726 } 727 goto done; 728 } 729 #ifdef DEBUG 730 if (rdma_clnt_debug) 731 printf("clnt_rdma_kcallit: got response xid %u\n", p->cku_xid); 732 #endif 733 /* 734 * Process the reply message. 735 * 736 * First the chunk list (if any) 737 */ 738 xdrs = &(p->cku_inxdr); 739 xdrmem_create(xdrs, (caddr_t)(uintptr_t)recvlist->c_saddr, 740 recvlist->c_len, XDR_DECODE); 741 /* 742 * Treat xid as opaque (xid is the first entity 743 * in the rpc rdma message). 744 */ 745 xid = *(uint32_t *)(uintptr_t)recvlist->c_saddr; 746 /* Skip xid and set the xdr position accordingly. */ 747 XDR_SETPOS(xdrs, sizeof (uint32_t)); 748 (void) xdr_u_int(xdrs, &vers); 749 (void) xdr_u_int(xdrs, &op); 750 (void) xdr_do_clist(xdrs, &cl); 751 off = xdr_getpos(xdrs); 752 753 /* 754 * Now the RPC reply message itself. If the reply 755 * came as a chunk item, then RDMA the reply over. 756 */ 757 xdrs = &replxdr; 758 if (cl && op == RDMA_NOMSG) { 759 struct clist *cle = cl; 760 761 rpcreply.type = CHUNK_BUFFER; 762 rpcreply.addr = kmem_alloc(cle->c_len, KM_SLEEP); 763 rpcreply.len = cle->c_len; 764 cle->c_daddr = (uint64)(uintptr_t)rpcreply.addr; 765 cl = cl->c_next; 766 cle->c_next = NULL; 767 768 /* 769 * Register the rpc reply chunk destination 770 */ 771 status = clist_register(conn, cle, 0); 772 if (status) { 773 rdma_buf_free(conn, &rpcreply); 774 clist_free(cle); 775 p->cku_err.re_status = RPC_CANTDECODERES; 776 p->cku_err.re_errno = EIO; 777 cmn_err(CE_WARN, 778 "clnt_rdma_kcallit: clist_register failed"); 779 goto rdma_done; 780 } 781 782 /* 783 * Now read rpc reply in 784 */ 785 #ifdef DEBUG 786 if (rdma_clnt_debug) 787 printf("clnt_rdma_kcallit: read chunk, len %d, xid %u, \ 788 reply xid %u\n", cle->c_len, p->cku_xid, xid); 789 #endif 790 status = RDMA_READ(conn, cle, WAIT); 791 if (status) { 792 (void) clist_deregister(conn, cle, 0); 793 rdma_buf_free(conn, &rpcreply); 794 clist_free(cle); 795 p->cku_err.re_status = RPC_CANTDECODERES; 796 p->cku_err.re_errno = EIO; 797 cmn_err(CE_WARN, 798 "clnt_rdma_kcallit: RDMA_READ failed"); 799 goto rdma_done; 800 } 801 802 /* 803 * sync the memory for dma 804 */ 805 status = clist_syncmem(conn, cle, 0); 806 if (status != RDMA_SUCCESS) { 807 (void) clist_deregister(conn, cle, 0); 808 rdma_buf_free(conn, &rpcreply); 809 clist_free(cle); 810 p->cku_err.re_status = RPC_CANTDECODERES; 811 p->cku_err.re_errno = EIO; 812 goto rdma_done; 813 } 814 815 /* 816 * Deregister the Long RPC chunk 817 */ 818 (void) clist_deregister(conn, cle, 0); 819 clist_free(cle); 820 xdrrdma_create(xdrs, rpcreply.addr, rpcreply.len, 0, cl, 821 XDR_DECODE, conn); 822 rxdrp = xdrs; 823 } else { 824 rpcreply.addr = NULL; 825 xdrrdma_create(xdrs, 826 (caddr_t)(uintptr_t)(recvlist->c_saddr + off), 827 recvlist->c_len - off, 0, cl, XDR_DECODE, conn); 828 rxdrp = xdrs; 829 } 830 831 reply_msg.rm_direction = REPLY; 832 reply_msg.rm_reply.rp_stat = MSG_ACCEPTED; 833 reply_msg.acpted_rply.ar_stat = SUCCESS; 834 reply_msg.acpted_rply.ar_verf = _null_auth; 835 /* 836 * xdr_results will be done in AUTH_UNWRAP. 837 */ 838 reply_msg.acpted_rply.ar_results.where = NULL; 839 reply_msg.acpted_rply.ar_results.proc = xdr_void; 840 841 /* 842 * Decode and validate the response. 843 */ 844 if (xdr_replymsg(xdrs, &reply_msg)) { 845 enum clnt_stat re_status; 846 847 _seterr_reply(&reply_msg, &(p->cku_err)); 848 849 re_status = p->cku_err.re_status; 850 if (re_status == RPC_SUCCESS) { 851 /* 852 * Reply is good, check auth. 853 */ 854 if (!AUTH_VALIDATE(h->cl_auth, 855 &reply_msg.acpted_rply.ar_verf)) { 856 p->cku_err.re_status = RPC_AUTHERROR; 857 p->cku_err.re_why = AUTH_INVALIDRESP; 858 RCSTAT_INCR(rcbadverfs); 859 cmn_err(CE_WARN, 860 "clnt_rdma_kcallit: AUTH_VALIDATE failed"); 861 } else if (!AUTH_UNWRAP(h->cl_auth, xdrs, 862 xdr_results, resultsp)) { 863 p->cku_err.re_status = RPC_CANTDECODERES; 864 p->cku_err.re_errno = EIO; 865 cmn_err(CE_WARN, 866 "clnt_rdma_kcallit: AUTH_UNWRAP failed"); 867 } 868 } else { 869 /* set errno in case we can't recover */ 870 if (re_status != RPC_VERSMISMATCH && 871 re_status != RPC_AUTHERROR && 872 re_status != RPC_PROGVERSMISMATCH) 873 p->cku_err.re_errno = EIO; 874 875 if (re_status == RPC_AUTHERROR) { 876 /* 877 * Map recoverable and unrecoverable 878 * authentication errors to appropriate 879 * errno 880 */ 881 switch (p->cku_err.re_why) { 882 case AUTH_BADCRED: 883 case AUTH_BADVERF: 884 case AUTH_INVALIDRESP: 885 case AUTH_TOOWEAK: 886 case AUTH_FAILED: 887 case RPCSEC_GSS_NOCRED: 888 case RPCSEC_GSS_FAILED: 889 p->cku_err.re_errno = EACCES; 890 break; 891 case AUTH_REJECTEDCRED: 892 case AUTH_REJECTEDVERF: 893 default: 894 p->cku_err.re_errno = EIO; 895 break; 896 } 897 RPCLOG(1, "clnt_rdma_kcallit : " 898 "authentication failed with " 899 "RPC_AUTHERROR of type %d\n", 900 p->cku_err.re_why); 901 } 902 cmn_err(CE_WARN, 903 "clnt_rdma_kcallit: RPC failed"); 904 905 } 906 } else { 907 p->cku_err.re_status = RPC_CANTDECODERES; 908 p->cku_err.re_errno = EIO; 909 cmn_err(CE_WARN, "clnt_rdma_kcallit: xdr_replymsg failed"); 910 } 911 912 /* 913 * If rpc reply is in a chunk, free it now. 914 */ 915 if (rpcreply.addr != NULL) 916 rdma_buf_free(conn, &rpcreply); 917 918 rdma_done: 919 if ((cl != NULL) || (op == RDMA_NOMSG)) { 920 rdma_buf_t donemsg; 921 922 /* 923 * Free the list holding the chunk info 924 */ 925 if (cl) { 926 clist_free(cl); 927 cl = NULL; 928 } 929 930 /* 931 * Tell the server that the reads are done 932 */ 933 donemsg.type = SEND_BUFFER; 934 if (RDMA_BUF_ALLOC(conn, &donemsg)) { 935 p->cku_err.re_status = RPC_CANTSEND; 936 p->cku_err.re_errno = EIO; 937 RCSTAT_INCR(rcnomem); 938 cmn_err(CE_WARN, "clnt_rdma_kcallit: no free buffer"); 939 goto done; 940 } 941 xdrs = &p->cku_outxdr; 942 xdrmem_create(xdrs, donemsg.addr, donemsg.len, XDR_ENCODE); 943 vers = RPCRDMA_VERS; 944 op = RDMA_DONE; 945 946 /* 947 * Treat xid as opaque (xid is the first entity 948 * in the rpc rdma message). 949 */ 950 (*(uint32_t *)donemsg.addr) = p->cku_xid; 951 /* Skip xid and set the xdr position accordingly. */ 952 XDR_SETPOS(xdrs, sizeof (uint32_t)); 953 if (!xdr_u_int(xdrs, &vers) || 954 !xdr_u_int(xdrs, &op)) { 955 cmn_err(CE_WARN, 956 "clnt_rdma_kcallit: xdr_u_int failed"); 957 rdma_buf_free(conn, &donemsg); 958 goto done; 959 } 960 961 sendlist = NULL; 962 clist_add(&sendlist, 0, XDR_GETPOS(xdrs), &donemsg.handle, 963 donemsg.addr, NULL, NULL); 964 965 status = RDMA_SEND(conn, sendlist, p->cku_xid); 966 if (status != RDMA_SUCCESS) { 967 cmn_err(CE_WARN, 968 "clnt_rdma_kcallit: RDMA_SEND failed xid %u", 969 p->cku_xid); 970 } 971 #ifdef DEBUG 972 else { 973 if (rdma_clnt_debug) 974 printf("clnt_rdma_kcallit: sent RDMA_DONE xid %u\n", 975 p->cku_xid); 976 } 977 #endif 978 clist_free(sendlist); 979 } 980 981 done: 982 if (cxdrp) 983 XDR_DESTROY(cxdrp); 984 if (rxdrp) { 985 (void) xdr_rpc_free_verifier(rxdrp, &reply_msg); 986 XDR_DESTROY(rxdrp); 987 } 988 989 if (recvlist) { 990 rdma_buf_t recvmsg; 991 992 recvmsg.addr = (caddr_t)(uintptr_t)recvlist->c_saddr; 993 recvmsg.type = RECV_BUFFER; 994 RDMA_BUF_FREE(conn, &recvmsg); 995 clist_free(recvlist); 996 } 997 RDMA_REL_CONN(conn); 998 if (p->cku_err.re_status != RPC_SUCCESS) { 999 RCSTAT_INCR(rcbadcalls); 1000 } 1001 return (p->cku_err.re_status); 1002 } 1003 1004 /* ARGSUSED */ 1005 static void 1006 clnt_rdma_kabort(CLIENT *h) 1007 { 1008 } 1009 1010 static void 1011 clnt_rdma_kerror(CLIENT *h, struct rpc_err *err) 1012 { 1013 struct cku_private *p = htop(h); 1014 1015 *err = p->cku_err; 1016 } 1017 1018 static bool_t 1019 clnt_rdma_kfreeres(CLIENT *h, xdrproc_t xdr_res, caddr_t res_ptr) 1020 { 1021 struct cku_private *p = htop(h); 1022 XDR *xdrs; 1023 1024 xdrs = &(p->cku_outxdr); 1025 xdrs->x_op = XDR_FREE; 1026 return ((*xdr_res)(xdrs, res_ptr)); 1027 } 1028 1029 /* ARGSUSED */ 1030 static bool_t 1031 clnt_rdma_kcontrol(CLIENT *h, int cmd, char *arg) 1032 { 1033 return (TRUE); 1034 } 1035 1036 /* ARGSUSED */ 1037 static int 1038 clnt_rdma_ksettimers(CLIENT *h, struct rpc_timers *t, struct rpc_timers *all, 1039 int minimum, void(*feedback)(int, int, caddr_t), caddr_t arg, 1040 uint32_t xid) 1041 { 1042 RCSTAT_INCR(rctimers); 1043 return (0); 1044 } 1045 1046 int 1047 rdma_reachable(int addr_type, struct netbuf *addr, struct knetconfig **knconf) 1048 { 1049 rdma_registry_t *rp; 1050 void *handle = NULL; 1051 struct knetconfig *knc; 1052 char *pf, *p; 1053 rdma_stat status; 1054 int error = 0; 1055 1056 if (!INGLOBALZONE(curproc)) 1057 return (-1); 1058 /* 1059 * modload the RDMA plugins if not already done. 1060 */ 1061 if (!rdma_modloaded) { 1062 mutex_enter(&rdma_modload_lock); 1063 if (!rdma_modloaded) { 1064 error = rdma_modload(); 1065 } 1066 mutex_exit(&rdma_modload_lock); 1067 if (error) 1068 return (-1); 1069 } 1070 1071 if (!rdma_dev_available) 1072 return (-1); 1073 1074 rw_enter(&rdma_lock, RW_READER); 1075 rp = rdma_mod_head; 1076 while (rp != NULL) { 1077 status = RDMA_REACHABLE(rp->r_mod->rdma_ops, addr_type, addr, 1078 &handle); 1079 if (status == RDMA_SUCCESS) { 1080 knc = kmem_zalloc(sizeof (struct knetconfig), 1081 KM_SLEEP); 1082 knc->knc_semantics = NC_TPI_RDMA; 1083 pf = kmem_alloc(KNC_STRSIZE, KM_SLEEP); 1084 p = kmem_alloc(KNC_STRSIZE, KM_SLEEP); 1085 if (addr_type == AF_INET) 1086 (void) strncpy(pf, NC_INET, KNC_STRSIZE); 1087 else if (addr_type == AF_INET6) 1088 (void) strncpy(pf, NC_INET6, KNC_STRSIZE); 1089 pf[KNC_STRSIZE - 1] = '\0'; 1090 1091 (void) strncpy(p, rp->r_mod->rdma_api, KNC_STRSIZE); 1092 p[KNC_STRSIZE - 1] = '\0'; 1093 1094 knc->knc_protofmly = pf; 1095 knc->knc_proto = p; 1096 knc->knc_rdev = (dev_t)handle; 1097 *knconf = knc; 1098 rw_exit(&rdma_lock); 1099 return (0); 1100 } 1101 rp = rp->r_next; 1102 } 1103 rw_exit(&rdma_lock); 1104 return (-1); 1105 } 1106