// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
/*
 * Copyright (c) 2014-2020, Oracle and/or its affiliates.
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * rpc_rdma.c
 *
 * This file contains the guts of the RPC RDMA protocol, and
 * does marshaling/unmarshaling, etc. It is also where interfacing
 * to the Linux RPC framework lives.
 */

#include <linux/highmem.h>

#include <linux/sunrpc/svc_rdma.h>

#include "xprt_rdma.h"
#include <trace/events/rpcrdma.h>

/* Returns size of largest RPC-over-RDMA header in a Call message
 *
 * The largest Call header contains a full-size Read list and a
 * minimal Reply chunk.
 */
static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs)
{
	unsigned int size;

	/* Fixed header fields and list discriminators */
	size = RPCRDMA_HDRLEN_MIN;

	/* Maximum Read list size */
	size += maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32);

	/* Minimal Reply chunk size */
	size += sizeof(__be32);	/* segment count */
	size += rpcrdma_segment_maxsz * sizeof(__be32);
	size += sizeof(__be32);	/* list discriminator */

	return size;
}

/* Returns size of largest RPC-over-RDMA header in a Reply message
 *
 * There is only one Write list or one Reply chunk per Reply
 * message. The larger list is the Write list.
 */
static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
{
	unsigned int size;

	/* Fixed header fields and list discriminators */
	size = RPCRDMA_HDRLEN_MIN;

	/* Maximum Write list size */
	size += sizeof(__be32);	/* segment count */
	size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32);
	size += sizeof(__be32);	/* list discriminator */

	return size;
}

/**
 * rpcrdma_set_max_header_sizes - Initialize inline payload sizes
 * @ep: endpoint to initialize
 *
 * The max_inline fields contain the maximum size of an RPC message
 * so the marshaling code doesn't have to repeat this calculation
 * for every RPC.
 */
void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
{
	unsigned int maxsegs = ep->re_max_rdma_segs;

	ep->re_max_inline_send =
		ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs);
	ep->re_max_inline_recv =
		ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
}

/* The client can send a request inline as long as the RPCRDMA header
 * plus the RPC call fit under the transport's inline limit. If the
 * combined call message size exceeds that limit, the client must use
 * a Read chunk for this operation.
 *
 * A Read chunk is also required if sending the RPC call inline would
 * exceed this device's max_sge limit.
 */
static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
				struct rpc_rqst *rqst)
{
	struct xdr_buf *xdr = &rqst->rq_snd_buf;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	unsigned int count, remaining, offset;

	if (xdr->len > ep->re_max_inline_send)
		return false;

	if (xdr->page_len) {
		remaining = xdr->page_len;
		offset = offset_in_page(xdr->page_base);
		count = RPCRDMA_MIN_SEND_SGES;
		while (remaining) {
			remaining -= min_t(unsigned int,
					   PAGE_SIZE - offset, remaining);
			offset = 0;
			if (++count > ep->re_attr.cap.max_send_sge)
				return false;
		}
	}

	return true;
}

/* The client can't know how large the actual reply will be. Thus it
 * plans for the largest possible reply for that particular ULP
 * operation. If the maximum combined reply message size exceeds that
 * limit, the client must provide a write list or a reply chunk for
 * this request.
 */
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
				   struct rpc_rqst *rqst)
{
	return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv;
}

/* The client is required to provide a Reply chunk if the maximum
 * size of the non-payload part of the RPC Reply is larger than
 * the inline threshold.
 */
static bool
rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
			  const struct rpc_rqst *rqst)
{
	const struct xdr_buf *buf = &rqst->rq_rcv_buf;

	return (buf->head[0].iov_len + buf->tail[0].iov_len) <
		r_xprt->rx_ep->re_max_inline_recv;
}

/* The ACL protocol likes to be lazy in allocating pages. For TCP,
 * these pages can be allocated during receive processing. Not true
 * for RDMA, which must always provision receive buffers
 * up front.
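 *
 * For example (assuming 4 KB pages), a sparse 12 KB receive buffer
 * with no pages attached yet needs three alloc_page() calls below
 * before the Receive can be provisioned.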
 */
static noinline int
rpcrdma_alloc_sparse_pages(struct xdr_buf *buf)
{
	struct page **ppages;
	int len;

	len = buf->page_len;
	ppages = buf->pages + (buf->page_base >> PAGE_SHIFT);
	while (len > 0) {
		if (!*ppages)
			*ppages = alloc_page(GFP_NOWAIT | __GFP_NOWARN);
		if (!*ppages)
			return -ENOBUFS;
		ppages++;
		len -= PAGE_SIZE;
	}

	return 0;
}

/* Convert @vec to a single SGL element.
 *
 * Returns pointer to next available SGE, and bumps the total number
 * of SGEs consumed.
 */
static struct rpcrdma_mr_seg *
rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
		     unsigned int *n)
{
	seg->mr_page = virt_to_page(vec->iov_base);
	seg->mr_offset = offset_in_page(vec->iov_base);
	seg->mr_len = vec->iov_len;
	++seg;
	++(*n);
	return seg;
}

/* Convert @xdrbuf into SGEs no larger than a page each. As they
 * are registered, these SGEs are then coalesced into RDMA segments
 * when the selected memreg mode supports it.
 *
 * Returns positive number of SGEs consumed, or a negative errno.
 */

static int
rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
		     unsigned int pos, enum rpcrdma_chunktype type,
		     struct rpcrdma_mr_seg *seg)
{
	unsigned long page_base;
	unsigned int len, n;
	struct page **ppages;

	n = 0;
	if (pos == 0)
		seg = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, &n);

	len = xdrbuf->page_len;
	ppages = xdrbuf->pages + (xdrbuf->page_base >> PAGE_SHIFT);
	page_base = offset_in_page(xdrbuf->page_base);
	while (len) {
		seg->mr_page = *ppages;
		seg->mr_offset = page_base;
		seg->mr_len = min_t(u32, PAGE_SIZE - page_base, len);
		len -= seg->mr_len;
		++ppages;
		++seg;
		++n;
		page_base = 0;
	}

	if (type == rpcrdma_readch || type == rpcrdma_writech)
		goto out;

	if (xdrbuf->tail[0].iov_len)
		rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, &n);

out:
	if (unlikely(n > RPCRDMA_MAX_SEGS))
		return -EIO;
	return n;
}

static int
encode_rdma_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
	if (unlikely(!p))
		return -EMSGSIZE;

	xdr_encode_rdma_segment(p, mr->mr_handle, mr->mr_length, mr->mr_offset);
	return 0;
}

static int
encode_read_segment(struct xdr_stream *xdr, struct rpcrdma_mr *mr,
		    u32 position)
{
	__be32 *p;

	p = xdr_reserve_space(xdr, 6 * sizeof(*p));
	if (unlikely(!p))
		return -EMSGSIZE;

	*p++ = xdr_one;			/* Item present */
	xdr_encode_read_segment(p, position, mr->mr_handle, mr->mr_length,
				mr->mr_offset);
	return 0;
}

static struct rpcrdma_mr_seg *rpcrdma_mr_prepare(struct rpcrdma_xprt *r_xprt,
						 struct rpcrdma_req *req,
						 struct rpcrdma_mr_seg *seg,
						 int nsegs, bool writing,
						 struct rpcrdma_mr **mr)
{
	*mr = rpcrdma_mr_pop(&req->rl_free_mrs);
	if (!*mr) {
		*mr = rpcrdma_mr_get(r_xprt);
		if (!*mr)
			goto out_getmr_err;
		(*mr)->mr_req = req;
	}

	rpcrdma_mr_push(*mr, &req->rl_registered);
	return frwr_map(r_xprt, seg, nsegs, writing, req->rl_slot.rq_xid, *mr);

out_getmr_err:
	trace_xprtrdma_nomrs_err(r_xprt, req);
	xprt_wait_for_buffer_space(&r_xprt->rx_xprt);
	rpcrdma_mrs_refresh(r_xprt);
	return ERR_PTR(-EAGAIN);
}

/* Register and XDR encode the Read list. Supports encoding a list of read
 * segments that belong to a single read chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Read chunklist (a linked list):
 *   N elements, position P (same P for all chunks of same arg!):
 *    1 - PHLOO - 1 - PHLOO - ... - 1 - PHLOO - 0
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 *
 * Only a single @pos value is currently supported.
 */
static int rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct rpc_rqst *rqst,
				    enum rpcrdma_chunktype rtype)
{
	struct xdr_stream *xdr = &req->rl_stream;
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mr *mr;
	unsigned int pos;
	int nsegs;

	if (rtype == rpcrdma_noch_pullup || rtype == rpcrdma_noch_mapped)
		goto done;

	pos = rqst->rq_snd_buf.head[0].iov_len;
	if (rtype == rpcrdma_areadch)
		pos = 0;
	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_snd_buf, pos,
				     rtype, seg);
	if (nsegs < 0)
		return nsegs;

	do {
		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, false, &mr);
		if (IS_ERR(seg))
			return PTR_ERR(seg);

		if (encode_read_segment(xdr, mr, pos) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_read(rqst->rq_task, pos, mr, nsegs);
		r_xprt->rx_stats.read_chunk_count++;
		nsegs -= mr->mr_nents;
	} while (nsegs);

done:
	if (xdr_stream_encode_item_absent(xdr) < 0)
		return -EMSGSIZE;
	return 0;
}

/* Register and XDR encode the Write list. Supports encoding a list
 * containing one array of plain segments that belong to a single
 * write chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Write chunklist (a list of (one) counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO - 0
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
 *
 * Only a single Write chunk is currently supported.
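 *
 * For instance, a Write chunk registered as two segments is emitted
 * as: 1 - 2 - HLOO - HLOO - 0 (item-present flag, segment count,
 * two segments, then the Write list terminator).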
 */
static int rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req,
				     struct rpc_rqst *rqst,
				     enum rpcrdma_chunktype wtype)
{
	struct xdr_stream *xdr = &req->rl_stream;
	struct rpcrdma_ep *ep = r_xprt->rx_ep;
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mr *mr;
	int nsegs, nchunks;
	__be32 *segcount;

	if (wtype != rpcrdma_writech)
		goto done;

	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf,
				     rqst->rq_rcv_buf.head[0].iov_len,
				     wtype, seg);
	if (nsegs < 0)
		return nsegs;

	if (xdr_stream_encode_item_present(xdr) < 0)
		return -EMSGSIZE;
	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
	if (unlikely(!segcount))
		return -EMSGSIZE;
	/* Actual value encoded below */

	nchunks = 0;
	do {
		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
		if (IS_ERR(seg))
			return PTR_ERR(seg);

		if (encode_rdma_segment(xdr, mr) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_write(rqst->rq_task, mr, nsegs);
		r_xprt->rx_stats.write_chunk_count++;
		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
		nchunks++;
		nsegs -= mr->mr_nents;
	} while (nsegs);

	if (xdr_pad_size(rqst->rq_rcv_buf.page_len)) {
		if (encode_rdma_segment(xdr, ep->re_write_pad_mr) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_wp(rqst->rq_task, ep->re_write_pad_mr,
					nsegs);
		r_xprt->rx_stats.write_chunk_count++;
		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
		nchunks++;
		nsegs -= mr->mr_nents;
	}

	/* Update count of segments in this Write chunk */
	*segcount = cpu_to_be32(nchunks);

done:
	if (xdr_stream_encode_item_absent(xdr) < 0)
		return -EMSGSIZE;
	return 0;
}

/* Register and XDR encode the Reply chunk. Supports encoding an array
 * of plain segments that belong to a single write (reply) chunk.
 *
 * Encoding key for single-list chunks (HLOO = Handle32 Length32 Offset64):
 *
 *  Reply chunk (a counted array):
 *   N elements:
 *    1 - N - HLOO - HLOO - ... - HLOO
 *
 * Returns zero on success, or a negative errno if a failure occurred.
 * @xdr is advanced to the next position in the stream.
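 *
 * For example, a Reply chunk built from three registered segments
 * is emitted as: 1 - 3 - HLOO - HLOO - HLOO. No list terminator
 * follows, since the Reply chunk is the last item in the header.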
 */
static int rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
				      struct rpcrdma_req *req,
				      struct rpc_rqst *rqst,
				      enum rpcrdma_chunktype wtype)
{
	struct xdr_stream *xdr = &req->rl_stream;
	struct rpcrdma_mr_seg *seg;
	struct rpcrdma_mr *mr;
	int nsegs, nchunks;
	__be32 *segcount;

	if (wtype != rpcrdma_replych) {
		if (xdr_stream_encode_item_absent(xdr) < 0)
			return -EMSGSIZE;
		return 0;
	}

	seg = req->rl_segments;
	nsegs = rpcrdma_convert_iovs(r_xprt, &rqst->rq_rcv_buf, 0, wtype, seg);
	if (nsegs < 0)
		return nsegs;

	if (xdr_stream_encode_item_present(xdr) < 0)
		return -EMSGSIZE;
	segcount = xdr_reserve_space(xdr, sizeof(*segcount));
	if (unlikely(!segcount))
		return -EMSGSIZE;
	/* Actual value encoded below */

	nchunks = 0;
	do {
		seg = rpcrdma_mr_prepare(r_xprt, req, seg, nsegs, true, &mr);
		if (IS_ERR(seg))
			return PTR_ERR(seg);

		if (encode_rdma_segment(xdr, mr) < 0)
			return -EMSGSIZE;

		trace_xprtrdma_chunk_reply(rqst->rq_task, mr, nsegs);
		r_xprt->rx_stats.reply_chunk_count++;
		r_xprt->rx_stats.total_rdma_request += mr->mr_length;
		nchunks++;
		nsegs -= mr->mr_nents;
	} while (nsegs);

	/* Update count of segments in the Reply chunk */
	*segcount = cpu_to_be32(nchunks);

	return 0;
}

static void rpcrdma_sendctx_done(struct kref *kref)
{
	struct rpcrdma_req *req =
		container_of(kref, struct rpcrdma_req, rl_kref);
	struct rpcrdma_rep *rep = req->rl_reply;

	rpcrdma_complete_rqst(rep);
	rep->rr_rxprt->rx_stats.reply_waits_for_send++;
}

/**
 * rpcrdma_sendctx_unmap - DMA-unmap Send buffer
 * @sc: sendctx containing SGEs to unmap
 *
 */
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc)
{
	struct rpcrdma_regbuf *rb = sc->sc_req->rl_sendbuf;
	struct ib_sge *sge;

	if (!sc->sc_unmap_count)
		return;

	/* The first two SGEs contain the transport header and
	 * the inline buffer. These are always left mapped so
	 * they can be cheaply re-used.
	 */
	for (sge = &sc->sc_sges[2]; sc->sc_unmap_count;
	     ++sge, --sc->sc_unmap_count)
		ib_dma_unmap_page(rdmab_device(rb), sge->addr, sge->length,
				  DMA_TO_DEVICE);

	kref_put(&sc->sc_req->rl_kref, rpcrdma_sendctx_done);
}

/* Prepare an SGE for the RPC-over-RDMA transport header.
 */
static void rpcrdma_prepare_hdr_sge(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req, u32 len)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];

	sge->addr = rdmab_addr(rb);
	sge->length = len;
	sge->lkey = rdmab_lkey(rb);

	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
				      DMA_TO_DEVICE);
}

/* The head iovec is straightforward, as it is usually already
 * DMA-mapped. Sync the content that has changed.
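 *
 * The send buffer regbuf is expected to stay mapped across RPCs;
 * rpcrdma_regbuf_dma_map() below is a no-op once a mapping exists,
 * so the per-Send cost is just a dma_sync of the bytes being sent.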
 */
static bool rpcrdma_prepare_head_iov(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req, unsigned int len)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
	struct rpcrdma_regbuf *rb = req->rl_sendbuf;

	if (!rpcrdma_regbuf_dma_map(r_xprt, rb))
		return false;

	sge->addr = rdmab_addr(rb);
	sge->length = len;
	sge->lkey = rdmab_lkey(rb);

	ib_dma_sync_single_for_device(rdmab_device(rb), sge->addr, sge->length,
				      DMA_TO_DEVICE);
	return true;
}

/* If there is a page list present, DMA map and prepare an
 * SGE for each page to be sent.
 */
static bool rpcrdma_prepare_pagelist(struct rpcrdma_req *req,
				     struct xdr_buf *xdr)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
	unsigned int page_base, len, remaining;
	struct page **ppages;
	struct ib_sge *sge;

	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
	page_base = offset_in_page(xdr->page_base);
	remaining = xdr->page_len;
	while (remaining) {
		sge = &sc->sc_sges[req->rl_wr.num_sge++];
		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
		sge->addr = ib_dma_map_page(rdmab_device(rb), *ppages,
					    page_base, len, DMA_TO_DEVICE);
		if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
			goto out_mapping_err;

		sge->length = len;
		sge->lkey = rdmab_lkey(rb);

		sc->sc_unmap_count++;
		ppages++;
		remaining -= len;
		page_base = 0;
	}

	return true;

out_mapping_err:
	trace_xprtrdma_dma_maperr(sge->addr);
	return false;
}

/* The tail iovec may include an XDR pad for the page list,
 * as well as additional content, and may not reside in the
 * same page as the head iovec.
 */
static bool rpcrdma_prepare_tail_iov(struct rpcrdma_req *req,
				     struct xdr_buf *xdr,
				     unsigned int page_base, unsigned int len)
{
	struct rpcrdma_sendctx *sc = req->rl_sendctx;
	struct ib_sge *sge = &sc->sc_sges[req->rl_wr.num_sge++];
	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
	struct page *page = virt_to_page(xdr->tail[0].iov_base);

	sge->addr = ib_dma_map_page(rdmab_device(rb), page, page_base, len,
				    DMA_TO_DEVICE);
	if (ib_dma_mapping_error(rdmab_device(rb), sge->addr))
		goto out_mapping_err;

	sge->length = len;
	sge->lkey = rdmab_lkey(rb);
	++sc->sc_unmap_count;
	return true;

out_mapping_err:
	trace_xprtrdma_dma_maperr(sge->addr);
	return false;
}

/* Copy the tail to the end of the head buffer.
 */
static void rpcrdma_pullup_tail_iov(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{
	unsigned char *dst;

	dst = (unsigned char *)xdr->head[0].iov_base;
	dst += xdr->head[0].iov_len + xdr->page_len;
	memmove(dst, xdr->tail[0].iov_base, xdr->tail[0].iov_len);
	r_xprt->rx_stats.pullup_copy_count += xdr->tail[0].iov_len;
}

/* Copy pagelist content into the head buffer.
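 * Each page's bytes are copied to just past head[0].iov_len, and
 * pullup_copy_count is bumped so the copies show up in the
 * transport's statistics.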
 */
static void rpcrdma_pullup_pagelist(struct rpcrdma_xprt *r_xprt,
				    struct rpcrdma_req *req,
				    struct xdr_buf *xdr)
{
	unsigned int len, page_base, remaining;
	struct page **ppages;
	unsigned char *src, *dst;

	dst = (unsigned char *)xdr->head[0].iov_base;
	dst += xdr->head[0].iov_len;
	ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
	page_base = offset_in_page(xdr->page_base);
	remaining = xdr->page_len;
	while (remaining) {
		src = page_address(*ppages);
		src += page_base;
		len = min_t(unsigned int, PAGE_SIZE - page_base, remaining);
		memcpy(dst, src, len);
		r_xprt->rx_stats.pullup_copy_count += len;

		ppages++;
		dst += len;
		remaining -= len;
		page_base = 0;
	}
}

/* Copy the contents of @xdr into @rl_sendbuf and DMA sync it.
 * When the head, pagelist, and tail are small, a pull-up copy
 * is considerably less costly than DMA mapping the components
 * of @xdr.
 *
 * Assumptions:
 *  - the caller has already verified that the total length
 *    of the RPC Call body will fit into @rl_sendbuf.
 */
static bool rpcrdma_prepare_noch_pullup(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
{
	if (unlikely(xdr->tail[0].iov_len))
		rpcrdma_pullup_tail_iov(r_xprt, req, xdr);

	if (unlikely(xdr->page_len))
		rpcrdma_pullup_pagelist(r_xprt, req, xdr);

	/* The whole RPC message resides in the head iovec now */
	return rpcrdma_prepare_head_iov(r_xprt, req, xdr->len);
}

static bool rpcrdma_prepare_noch_mapped(struct rpcrdma_xprt *r_xprt,
					struct rpcrdma_req *req,
					struct xdr_buf *xdr)
{
	struct kvec *tail = &xdr->tail[0];

	if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
		return false;
	if (xdr->page_len)
		if (!rpcrdma_prepare_pagelist(req, xdr))
			return false;
	if (tail->iov_len)
		if (!rpcrdma_prepare_tail_iov(req, xdr,
					      offset_in_page(tail->iov_base),
					      tail->iov_len))
			return false;

	if (req->rl_sendctx->sc_unmap_count)
		kref_get(&req->rl_kref);
	return true;
}

static bool rpcrdma_prepare_readch(struct rpcrdma_xprt *r_xprt,
				   struct rpcrdma_req *req,
				   struct xdr_buf *xdr)
{
	if (!rpcrdma_prepare_head_iov(r_xprt, req, xdr->head[0].iov_len))
		return false;

	/* If there is a Read chunk, the page list is being handled
	 * via explicit RDMA, and thus is skipped here.
	 */

	/* Do not include the tail if it is only an XDR pad */
	if (xdr->tail[0].iov_len > 3) {
		unsigned int page_base, len;

		/* If the content in the page list is an odd length,
		 * xdr_write_pages() adds a pad at the beginning of
		 * the tail iovec. Force the tail's non-pad content to
		 * land at the next XDR position in the Send message.
		 */
		page_base = offset_in_page(xdr->tail[0].iov_base);
		len = xdr->tail[0].iov_len;
		page_base += len & 3;
		len -= len & 3;
		if (!rpcrdma_prepare_tail_iov(req, xdr, page_base, len))
			return false;
		kref_get(&req->rl_kref);
	}

	return true;
}

/**
 * rpcrdma_prepare_send_sges - Construct SGEs for a Send WR
 * @r_xprt: controlling transport
 * @req: context of RPC Call being marshalled
 * @hdrlen: size of transport header, in bytes
 * @xdr: xdr_buf containing RPC Call
 * @rtype: chunk type being encoded
 *
 * Returns 0 on success; otherwise a negative errno is returned.
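 *
 * The Send SGE array is assembled as follows: sc_sges[0] carries
 * the transport header, sc_sges[1] typically carries the head iovec
 * (or the pulled-up RPC message), and any remaining elements carry
 * DMA-mapped page-list and tail bytes. Only elements from
 * sc_sges[2] onward are unmapped once the Send completes.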
 */
inline int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
				     struct rpcrdma_req *req, u32 hdrlen,
				     struct xdr_buf *xdr,
				     enum rpcrdma_chunktype rtype)
{
	int ret;

	ret = -EAGAIN;
	req->rl_sendctx = rpcrdma_sendctx_get_locked(r_xprt);
	if (!req->rl_sendctx)
		goto out_nosc;
	req->rl_sendctx->sc_unmap_count = 0;
	req->rl_sendctx->sc_req = req;
	kref_init(&req->rl_kref);
	req->rl_wr.wr_cqe = &req->rl_sendctx->sc_cqe;
	req->rl_wr.sg_list = req->rl_sendctx->sc_sges;
	req->rl_wr.num_sge = 0;
	req->rl_wr.opcode = IB_WR_SEND;

	rpcrdma_prepare_hdr_sge(r_xprt, req, hdrlen);

	ret = -EIO;
	switch (rtype) {
	case rpcrdma_noch_pullup:
		if (!rpcrdma_prepare_noch_pullup(r_xprt, req, xdr))
			goto out_unmap;
		break;
	case rpcrdma_noch_mapped:
		if (!rpcrdma_prepare_noch_mapped(r_xprt, req, xdr))
			goto out_unmap;
		break;
	case rpcrdma_readch:
		if (!rpcrdma_prepare_readch(r_xprt, req, xdr))
			goto out_unmap;
		break;
	case rpcrdma_areadch:
		break;
	default:
		goto out_unmap;
	}

	return 0;

out_unmap:
	rpcrdma_sendctx_unmap(req->rl_sendctx);
out_nosc:
	trace_xprtrdma_prepsend_failed(&req->rl_slot, ret);
	return ret;
}

/**
 * rpcrdma_marshal_req - Marshal and send one RPC request
 * @r_xprt: controlling transport
 * @rqst: RPC request to be marshaled
 *
 * For the RPC in "rqst", this function:
 *  - Chooses the transfer mode (e.g., RDMA_MSG or RDMA_NOMSG)
 *  - Registers Read, Write, and Reply chunks
 *  - Constructs the transport header
 *  - Posts a Send WR to send the transport header and request
 *
 * Returns:
 *	%0 if the RPC was sent successfully,
 *	%-ENOTCONN if the connection was lost,
 *	%-EAGAIN if the caller should call again with the same arguments,
 *	%-ENOBUFS if the caller should call again after a delay,
 *	%-EMSGSIZE if the transport header is too small,
 *	%-EIO if a permanent problem occurred while marshaling.
 */
int
rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst)
{
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
	struct xdr_stream *xdr = &req->rl_stream;
	enum rpcrdma_chunktype rtype, wtype;
	struct xdr_buf *buf = &rqst->rq_snd_buf;
	bool ddp_allowed;
	__be32 *p;
	int ret;

	if (unlikely(rqst->rq_rcv_buf.flags & XDRBUF_SPARSE_PAGES)) {
		ret = rpcrdma_alloc_sparse_pages(&rqst->rq_rcv_buf);
		if (ret)
			return ret;
	}

	rpcrdma_set_xdrlen(&req->rl_hdrbuf, 0);
	xdr_init_encode(xdr, &req->rl_hdrbuf, rdmab_data(req->rl_rdmabuf),
			rqst);

	/* Fixed header fields */
	ret = -EMSGSIZE;
	p = xdr_reserve_space(xdr, 4 * sizeof(*p));
	if (!p)
		goto out_err;
	*p++ = rqst->rq_xid;
	*p++ = rpcrdma_version;
	*p++ = r_xprt->rx_buf.rb_max_requests;

	/* When the ULP employs a GSS flavor that guarantees integrity
	 * or privacy, direct data placement of individual data items
	 * is not allowed.
	 */
	ddp_allowed = !test_bit(RPCAUTH_AUTH_DATATOUCH,
				&rqst->rq_cred->cr_auth->au_flags);

	/*
	 * Chunks needed for results?
	 *
	 * o If the expected result is under the inline threshold, all ops
	 *   return as inline.
	 * o Large read ops return data as write chunk(s), header as
	 *   inline.
	 * o Large non-read ops return as a single reply chunk.
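	 *
	 * For example, with a 4 KB inline threshold, a 128 KB READ-class
	 * reply returns its payload via a Write chunk while the rest of
	 * the Reply stays inline; a large reply with no DDP-eligible
	 * data item is returned via a Reply chunk instead.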
	 */
	if (rpcrdma_results_inline(r_xprt, rqst))
		wtype = rpcrdma_noch;
	else if ((ddp_allowed && rqst->rq_rcv_buf.flags & XDRBUF_READ) &&
		 rpcrdma_nonpayload_inline(r_xprt, rqst))
		wtype = rpcrdma_writech;
	else
		wtype = rpcrdma_replych;

	/*
	 * Chunks needed for arguments?
	 *
	 * o If the total request is under the inline threshold, all ops
	 *   are sent as inline.
	 * o Large write ops transmit data as read chunk(s), header as
	 *   inline.
	 * o Large non-write ops are sent with the entire message as a
	 *   single read chunk (protocol 0-position special case).
	 *
	 * This assumes that the upper layer does not present a request
	 * that both has a data payload, and whose non-data arguments
	 * by themselves are larger than the inline threshold.
	 */
	if (rpcrdma_args_inline(r_xprt, rqst)) {
		*p++ = rdma_msg;
		rtype = buf->len < rdmab_length(req->rl_sendbuf) ?
			rpcrdma_noch_pullup : rpcrdma_noch_mapped;
	} else if (ddp_allowed && buf->flags & XDRBUF_WRITE) {
		*p++ = rdma_msg;
		rtype = rpcrdma_readch;
	} else {
		r_xprt->rx_stats.nomsg_call_count++;
		*p++ = rdma_nomsg;
		rtype = rpcrdma_areadch;
	}

	/* This implementation supports the following combinations
	 * of chunk lists in one RPC-over-RDMA Call message:
	 *
	 *   - Read list
	 *   - Write list
	 *   - Reply chunk
	 *   - Read list + Reply chunk
	 *
	 * It might not yet support the following combinations:
	 *
	 *   - Read list + Write list
	 *
	 * It does not support the following combinations:
	 *
	 *   - Write list + Reply chunk
	 *   - Read list + Write list + Reply chunk
	 *
	 * This implementation supports only a single chunk in each
	 * Read or Write list. Thus for example the client cannot
	 * send a Call message with a Position Zero Read chunk and a
	 * regular Read chunk at the same time.
	 */
	ret = rpcrdma_encode_read_list(r_xprt, req, rqst, rtype);
	if (ret)
		goto out_err;
	ret = rpcrdma_encode_write_list(r_xprt, req, rqst, wtype);
	if (ret)
		goto out_err;
	ret = rpcrdma_encode_reply_chunk(r_xprt, req, rqst, wtype);
	if (ret)
		goto out_err;

	ret = rpcrdma_prepare_send_sges(r_xprt, req, req->rl_hdrbuf.len,
					buf, rtype);
	if (ret)
		goto out_err;

	trace_xprtrdma_marshal(req, rtype, wtype);
	return 0;

out_err:
	trace_xprtrdma_marshal_failed(rqst, ret);
	r_xprt->rx_stats.failed_marshal_count++;
	frwr_reset(req);
	return ret;
}

static void __rpcrdma_update_cwnd_locked(struct rpc_xprt *xprt,
					 struct rpcrdma_buffer *buf,
					 u32 grant)
{
	buf->rb_credits = grant;
	xprt->cwnd = grant << RPC_CWNDSHIFT;
}

static void rpcrdma_update_cwnd(struct rpcrdma_xprt *r_xprt, u32 grant)
{
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;

	spin_lock(&xprt->transport_lock);
	__rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, grant);
	spin_unlock(&xprt->transport_lock);
}

/**
 * rpcrdma_reset_cwnd - Reset the xprt's congestion window
 * @r_xprt: controlling transport instance
 *
 * Prepare @r_xprt for the next connection by reinitializing
 * its credit grant to one (see RFC 8166, Section 3.3.3).
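 *
 * With a single credit, xprt->cwnd becomes 1 << RPC_CWNDSHIFT, so
 * only one RPC can be in flight until the server's first Reply
 * grants more credits.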
 */
void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt)
{
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;

	spin_lock(&xprt->transport_lock);
	xprt->cong = 0;
	__rpcrdma_update_cwnd_locked(xprt, &r_xprt->rx_buf, 1);
	spin_unlock(&xprt->transport_lock);
}

/**
 * rpcrdma_inline_fixup - Scatter inline received data into rqst's iovecs
 * @rqst: controlling RPC request
 * @srcp: points to RPC message payload in receive buffer
 * @copy_len: remaining length of receive buffer content
 * @pad: Write chunk pad bytes needed (zero for pure inline)
 *
 * The upper layer has set the maximum number of bytes it can
 * receive in each component of rq_rcv_buf. These values are set in
 * the head.iov_len, page_len, tail.iov_len, and buflen fields.
 *
 * Unlike the TCP equivalent (xdr_partial_copy_from_skb), in
 * many cases this function simply updates iov_base pointers in
 * rq_rcv_buf to point directly to the received reply data, to
 * avoid copying reply data.
 *
 * Returns the count of bytes which had to be memcopied.
 */
static unsigned long
rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
{
	unsigned long fixup_copy_count;
	int i, npages, curlen;
	char *destp;
	struct page **ppages;
	int page_base;

	/* The head iovec is redirected to the RPC reply message
	 * in the receive buffer, to avoid a memcopy.
	 */
	rqst->rq_rcv_buf.head[0].iov_base = srcp;
	rqst->rq_private_buf.head[0].iov_base = srcp;

	/* The contents of the receive buffer that follow
	 * head.iov_len bytes are copied into the page list.
	 */
	curlen = rqst->rq_rcv_buf.head[0].iov_len;
	if (curlen > copy_len)
		curlen = copy_len;
	srcp += curlen;
	copy_len -= curlen;

	ppages = rqst->rq_rcv_buf.pages +
		(rqst->rq_rcv_buf.page_base >> PAGE_SHIFT);
	page_base = offset_in_page(rqst->rq_rcv_buf.page_base);
	fixup_copy_count = 0;
	if (copy_len && rqst->rq_rcv_buf.page_len) {
		int pagelist_len;

		pagelist_len = rqst->rq_rcv_buf.page_len;
		if (pagelist_len > copy_len)
			pagelist_len = copy_len;
		npages = PAGE_ALIGN(page_base + pagelist_len) >> PAGE_SHIFT;
		for (i = 0; i < npages; i++) {
			curlen = PAGE_SIZE - page_base;
			if (curlen > pagelist_len)
				curlen = pagelist_len;

			destp = kmap_atomic(ppages[i]);
			memcpy(destp + page_base, srcp, curlen);
			flush_dcache_page(ppages[i]);
			kunmap_atomic(destp);
			srcp += curlen;
			copy_len -= curlen;
			fixup_copy_count += curlen;
			pagelist_len -= curlen;
			if (!pagelist_len)
				break;
			page_base = 0;
		}

		/* Implicit padding for the last segment in a Write
		 * chunk is inserted inline at the front of the tail
		 * iovec. The upper layer ignores the content of
		 * the pad. Simply ensure inline content in the tail
		 * that follows the Write chunk is properly aligned.
		 */
		if (pad)
			srcp -= pad;
	}

	/* The tail iovec is redirected to the remaining data
	 * in the receive buffer, to avoid a memcopy.
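	 * Both rq_rcv_buf and rq_private_buf are updated so the two
	 * views of the receive buffer stay in step.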
	 */
	if (copy_len || pad) {
		rqst->rq_rcv_buf.tail[0].iov_base = srcp;
		rqst->rq_private_buf.tail[0].iov_base = srcp;
	}

	if (fixup_copy_count)
		trace_xprtrdma_fixup(rqst, fixup_copy_count);
	return fixup_copy_count;
}

/* By convention, backchannel calls arrive via rdma_msg type
 * messages, and never populate the chunk lists. This makes
 * the RPC/RDMA header small and fixed in size, so it is
 * straightforward to check the RPC header's direction field.
 */
static bool
rpcrdma_is_bcall(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
#if defined(CONFIG_SUNRPC_BACKCHANNEL)
{
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	struct xdr_stream *xdr = &rep->rr_stream;
	__be32 *p;

	if (rep->rr_proc != rdma_msg)
		return false;

	/* Peek at stream contents without advancing. */
	p = xdr_inline_decode(xdr, 0);

	/* Chunk lists */
	if (xdr_item_is_present(p++))
		return false;
	if (xdr_item_is_present(p++))
		return false;
	if (xdr_item_is_present(p++))
		return false;

	/* RPC header */
	if (*p++ != rep->rr_xid)
		return false;
	if (*p != cpu_to_be32(RPC_CALL))
		return false;

	/* No bc service. */
	if (xprt->bc_serv == NULL)
		return false;

	/* Now that we are sure this is a backchannel call,
	 * advance to the RPC header.
	 */
	p = xdr_inline_decode(xdr, 3 * sizeof(*p));
	if (unlikely(!p))
		return true;

	rpcrdma_bc_receive_call(r_xprt, rep);
	return true;
}
#else	/* CONFIG_SUNRPC_BACKCHANNEL */
{
	return false;
}
#endif	/* CONFIG_SUNRPC_BACKCHANNEL */

static int decode_rdma_segment(struct xdr_stream *xdr, u32 *length)
{
	u32 handle;
	u64 offset;
	__be32 *p;

	p = xdr_inline_decode(xdr, 4 * sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	xdr_decode_rdma_segment(p, &handle, length, &offset);
	trace_xprtrdma_decode_seg(handle, *length, offset);
	return 0;
}

static int decode_write_chunk(struct xdr_stream *xdr, u32 *length)
{
	u32 segcount, seglength;
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	*length = 0;
	segcount = be32_to_cpup(p);
	while (segcount--) {
		if (decode_rdma_segment(xdr, &seglength))
			return -EIO;
		*length += seglength;
	}

	return 0;
}

/* In RPC-over-RDMA Version One replies, a Read list is never
 * expected. This decoder is a stub that returns an error if
 * a Read list is present.
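 * Only one word is examined: anything other than the item-absent
 * discriminator (xdr_zero) is treated as a malformed header.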
 */
static int decode_read_list(struct xdr_stream *xdr)
{
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;
	if (unlikely(xdr_item_is_present(p)))
		return -EIO;
	return 0;
}

/* Supports only one Write chunk in the Write list
 */
static int decode_write_list(struct xdr_stream *xdr, u32 *length)
{
	u32 chunklen;
	bool first;
	__be32 *p;

	*length = 0;
	first = true;
	do {
		p = xdr_inline_decode(xdr, sizeof(*p));
		if (unlikely(!p))
			return -EIO;
		if (xdr_item_is_absent(p))
			break;
		if (!first)
			return -EIO;

		if (decode_write_chunk(xdr, &chunklen))
			return -EIO;
		*length += chunklen;
		first = false;
	} while (true);
	return 0;
}

static int decode_reply_chunk(struct xdr_stream *xdr, u32 *length)
{
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	*length = 0;
	if (xdr_item_is_present(p))
		if (decode_write_chunk(xdr, length))
			return -EIO;
	return 0;
}

static int
rpcrdma_decode_msg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
		   struct rpc_rqst *rqst)
{
	struct xdr_stream *xdr = &rep->rr_stream;
	u32 writelist, replychunk, rpclen;
	char *base;

	/* Decode the chunk lists */
	if (decode_read_list(xdr))
		return -EIO;
	if (decode_write_list(xdr, &writelist))
		return -EIO;
	if (decode_reply_chunk(xdr, &replychunk))
		return -EIO;

	/* RDMA_MSG sanity checks */
	if (unlikely(replychunk))
		return -EIO;

	/* Build the RPC reply's Payload stream in rqst->rq_rcv_buf */
	base = (char *)xdr_inline_decode(xdr, 0);
	rpclen = xdr_stream_remaining(xdr);
	r_xprt->rx_stats.fixup_copy_count +=
		rpcrdma_inline_fixup(rqst, base, rpclen, writelist & 3);

	r_xprt->rx_stats.total_rdma_reply += writelist;
	return rpclen + xdr_align_size(writelist);
}

static noinline int
rpcrdma_decode_nomsg(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep)
{
	struct xdr_stream *xdr = &rep->rr_stream;
	u32 writelist, replychunk;

	/* Decode the chunk lists */
	if (decode_read_list(xdr))
		return -EIO;
	if (decode_write_list(xdr, &writelist))
		return -EIO;
	if (decode_reply_chunk(xdr, &replychunk))
		return -EIO;

	/* RDMA_NOMSG sanity checks */
	if (unlikely(writelist))
		return -EIO;
	if (unlikely(!replychunk))
		return -EIO;

	/* Reply chunk buffer already is the reply vector */
	r_xprt->rx_stats.total_rdma_reply += replychunk;
	return replychunk;
}

static noinline int
rpcrdma_decode_error(struct rpcrdma_xprt *r_xprt, struct rpcrdma_rep *rep,
		     struct rpc_rqst *rqst)
{
	struct xdr_stream *xdr = &rep->rr_stream;
	__be32 *p;

	p = xdr_inline_decode(xdr, sizeof(*p));
	if (unlikely(!p))
		return -EIO;

	switch (*p) {
	case err_vers:
		p = xdr_inline_decode(xdr, 2 * sizeof(*p));
		if (!p)
			break;
		trace_xprtrdma_err_vers(rqst, p, p + 1);
		break;
	case err_chunk:
		trace_xprtrdma_err_chunk(rqst);
		break;
	default:
		trace_xprtrdma_err_unrecognized(rqst, p);
	}

	return -EIO;
}

/**
 * rpcrdma_unpin_rqst - Release rqst without completing it
 * @rep: RPC/RDMA Receive context
 *
 * This is done when a connection is lost so that a Reply
 * can be dropped and its matching Call can be subsequently
 * retransmitted on a new connection.
 */
void rpcrdma_unpin_rqst(struct rpcrdma_rep *rep)
{
	struct rpc_xprt *xprt = &rep->rr_rxprt->rx_xprt;
	struct rpc_rqst *rqst = rep->rr_rqst;
	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);

	req->rl_reply = NULL;
	rep->rr_rqst = NULL;

	spin_lock(&xprt->queue_lock);
	xprt_unpin_rqst(rqst);
	spin_unlock(&xprt->queue_lock);
}

/**
 * rpcrdma_complete_rqst - Pass completed rqst back to RPC
 * @rep: RPC/RDMA Receive context
 *
 * Reconstruct the RPC reply and complete the transaction
 * while @rqst is still pinned to ensure the rep, rqst, and
 * rq_task pointers remain stable.
 */
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep)
{
	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	struct rpc_rqst *rqst = rep->rr_rqst;
	int status;

	switch (rep->rr_proc) {
	case rdma_msg:
		status = rpcrdma_decode_msg(r_xprt, rep, rqst);
		break;
	case rdma_nomsg:
		status = rpcrdma_decode_nomsg(r_xprt, rep);
		break;
	case rdma_error:
		status = rpcrdma_decode_error(r_xprt, rep, rqst);
		break;
	default:
		status = -EIO;
	}
	if (status < 0)
		goto out_badheader;

out:
	spin_lock(&xprt->queue_lock);
	xprt_complete_rqst(rqst->rq_task, status);
	xprt_unpin_rqst(rqst);
	spin_unlock(&xprt->queue_lock);
	return;

out_badheader:
	trace_xprtrdma_reply_hdr_err(rep);
	r_xprt->rx_stats.bad_reply_count++;
	rqst->rq_task->tk_status = status;
	status = 0;
	goto out;
}

static void rpcrdma_reply_done(struct kref *kref)
{
	struct rpcrdma_req *req =
		container_of(kref, struct rpcrdma_req, rl_kref);

	rpcrdma_complete_rqst(req->rl_reply);
}

/**
 * rpcrdma_reply_handler - Process received RPC/RDMA messages
 * @rep: Incoming rpcrdma_rep object to process
 *
 * Errors must result in the RPC task either being awakened, or
 * allowed to timeout, to discover the errors at that time.
 */
void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
{
	struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
	struct rpc_xprt *xprt = &r_xprt->rx_xprt;
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_req *req;
	struct rpc_rqst *rqst;
	u32 credits;
	__be32 *p;

	/* Any data means we had a useful conversation, so
	 * then we don't need to delay the next reconnect.
	 */
	if (xprt->reestablish_timeout)
		xprt->reestablish_timeout = 0;

	/* Fixed transport header fields */
	xdr_init_decode(&rep->rr_stream, &rep->rr_hdrbuf,
			rep->rr_hdrbuf.head[0].iov_base, NULL);
	p = xdr_inline_decode(&rep->rr_stream, 4 * sizeof(*p));
	if (unlikely(!p))
		goto out_shortreply;
	rep->rr_xid = *p++;
	rep->rr_vers = *p++;
	credits = be32_to_cpu(*p++);
	rep->rr_proc = *p++;

	if (rep->rr_vers != rpcrdma_version)
		goto out_badversion;

	if (rpcrdma_is_bcall(r_xprt, rep))
		return;

	/* Match incoming rpcrdma_rep to an rpcrdma_req to
	 * get context for handling any incoming chunks.
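	 * The rqst is pinned while the Reply is processed so it cannot
	 * be retired underneath us; rpcrdma_unpin_rqst() above covers
	 * the case where the connection is lost before processing
	 * completes.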
	 */
	spin_lock(&xprt->queue_lock);
	rqst = xprt_lookup_rqst(xprt, rep->rr_xid);
	if (!rqst)
		goto out_norqst;
	xprt_pin_rqst(rqst);
	spin_unlock(&xprt->queue_lock);

	if (credits == 0)
		credits = 1;	/* don't deadlock */
	else if (credits > r_xprt->rx_ep->re_max_requests)
		credits = r_xprt->rx_ep->re_max_requests;
	rpcrdma_post_recvs(r_xprt, credits + (buf->rb_bc_srv_max_requests << 1));
	if (buf->rb_credits != credits)
		rpcrdma_update_cwnd(r_xprt, credits);

	req = rpcr_to_rdmar(rqst);
	if (unlikely(req->rl_reply))
		rpcrdma_rep_put(buf, req->rl_reply);
	req->rl_reply = rep;
	rep->rr_rqst = rqst;

	trace_xprtrdma_reply(rqst->rq_task, rep, credits);

	if (rep->rr_wc_flags & IB_WC_WITH_INVALIDATE)
		frwr_reminv(rep, &req->rl_registered);
	if (!list_empty(&req->rl_registered))
		frwr_unmap_async(r_xprt, req);
		/* LocalInv completion will complete the RPC */
	else
		kref_put(&req->rl_kref, rpcrdma_reply_done);
	return;

out_badversion:
	trace_xprtrdma_reply_vers_err(rep);
	goto out;

out_norqst:
	spin_unlock(&xprt->queue_lock);
	trace_xprtrdma_reply_rqst_err(rep);
	goto out;

out_shortreply:
	trace_xprtrdma_reply_short_err(rep);

out:
	rpcrdma_rep_put(buf, rep);
}