1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 /* 3 * Copyright (c) 2014-2017 Oracle. All rights reserved. 4 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved. 5 * 6 * This software is available to you under a choice of one of two 7 * licenses. You may choose to be licensed under the terms of the GNU 8 * General Public License (GPL) Version 2, available from the file 9 * COPYING in the main directory of this source tree, or the BSD-type 10 * license below: 11 * 12 * Redistribution and use in source and binary forms, with or without 13 * modification, are permitted provided that the following conditions 14 * are met: 15 * 16 * Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 19 * Redistributions in binary form must reproduce the above 20 * copyright notice, this list of conditions and the following 21 * disclaimer in the documentation and/or other materials provided 22 * with the distribution. 23 * 24 * Neither the name of the Network Appliance, Inc. nor the names of 25 * its contributors may be used to endorse or promote products 26 * derived from this software without specific prior written 27 * permission. 28 * 29 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 30 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 31 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR 32 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT 33 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, 34 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 35 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, 36 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY 37 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 38 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 39 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 40 */ 41 42 /* 43 * verbs.c 44 * 45 * Encapsulates the major functions managing: 46 * o adapters 47 * o endpoints 48 * o connections 49 * o buffer memory 50 */ 51 52 #include <linux/bitops.h> 53 #include <linux/interrupt.h> 54 #include <linux/slab.h> 55 #include <linux/sunrpc/addr.h> 56 #include <linux/sunrpc/svc_rdma.h> 57 #include <linux/log2.h> 58 59 #include <asm/barrier.h> 60 61 #include <rdma/ib_cm.h> 62 63 #include "xprt_rdma.h" 64 #include <trace/events/rpcrdma.h> 65 66 static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt); 67 static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt); 68 static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf, 69 unsigned long item); 70 static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, 71 struct rpcrdma_sendctx *sc); 72 static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt); 73 static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt); 74 static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt); 75 static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt); 76 static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); 77 static void rpcrdma_ep_get(struct rpcrdma_ep *ep); 78 static int rpcrdma_ep_put(struct rpcrdma_ep *ep); 79 static struct rpcrdma_regbuf * 80 rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction, 81 int node); 82 static struct rpcrdma_regbuf * 83 rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction); 84 static bool rpcrdma_regbuf_realloc_node(struct rpcrdma_regbuf *rb, 85 size_t size, gfp_t flags, int node); 86 static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); 87 static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); 88 89 /* Wait for outstanding transport work to finish. ib_drain_qp 90 * handles the drains in the wrong order for us, so open code 91 * them here. 92 */ 93 static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt) 94 { 95 struct rpcrdma_ep *ep = r_xprt->rx_ep; 96 struct rdma_cm_id *id = ep->re_id; 97 98 /* Wait for rpcrdma_post_recvs() to leave its critical 99 * section. 100 */ 101 if (atomic_inc_return(&ep->re_receiving) > 1) 102 wait_for_completion(&ep->re_done); 103 104 /* Flush Receives, then wait for deferred Reply work 105 * to complete. 106 */ 107 ib_drain_rq(id->qp); 108 109 /* Deferred Reply processing might have scheduled 110 * local invalidations. 111 */ 112 ib_drain_sq(id->qp); 113 114 rpcrdma_ep_put(ep); 115 } 116 117 /* Ensure xprt_force_disconnect() is invoked exactly once when a 118 * connection is closed or lost. (The important thing is it needs 119 * to be invoked "at least" once). 120 */ 121 void rpcrdma_force_disconnect(struct rpcrdma_ep *ep) 122 { 123 if (atomic_add_unless(&ep->re_force_disconnect, 1, 1)) 124 xprt_force_disconnect(ep->re_xprt); 125 } 126 127 /** 128 * rpcrdma_flush_disconnect - Disconnect on flushed completion 129 * @r_xprt: transport to disconnect 130 * @wc: work completion entry 131 * 132 * Must be called in process context. 133 */ 134 void rpcrdma_flush_disconnect(struct rpcrdma_xprt *r_xprt, struct ib_wc *wc) 135 { 136 if (wc->status != IB_WC_SUCCESS) 137 rpcrdma_force_disconnect(r_xprt->rx_ep); 138 } 139 140 /** 141 * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC 142 * @cq: completion queue 143 * @wc: WCE for a completed Send WR 144 * 145 */ 146 static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc) 147 { 148 struct ib_cqe *cqe = wc->wr_cqe; 149 struct rpcrdma_sendctx *sc = 150 container_of(cqe, struct rpcrdma_sendctx, sc_cqe); 151 struct rpcrdma_xprt *r_xprt = cq->cq_context; 152 153 /* WARNING: Only wr_cqe and status are reliable at this point */ 154 trace_xprtrdma_wc_send(wc, &sc->sc_cid); 155 rpcrdma_sendctx_put_locked(r_xprt, sc); 156 rpcrdma_flush_disconnect(r_xprt, wc); 157 } 158 159 /** 160 * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC 161 * @cq: completion queue 162 * @wc: WCE for a completed Receive WR 163 * 164 */ 165 static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc) 166 { 167 struct ib_cqe *cqe = wc->wr_cqe; 168 struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep, 169 rr_cqe); 170 struct rpcrdma_xprt *r_xprt = cq->cq_context; 171 172 /* WARNING: Only wr_cqe and status are reliable at this point */ 173 trace_xprtrdma_wc_receive(wc, &rep->rr_cid); 174 --r_xprt->rx_ep->re_receive_count; 175 if (wc->status != IB_WC_SUCCESS) 176 goto out_flushed; 177 178 /* status == SUCCESS means all fields in wc are trustworthy */ 179 rpcrdma_set_xdrlen(&rep->rr_hdrbuf, wc->byte_len); 180 rep->rr_wc_flags = wc->wc_flags; 181 rep->rr_inv_rkey = wc->ex.invalidate_rkey; 182 183 ib_dma_sync_single_for_cpu(rdmab_device(rep->rr_rdmabuf), 184 rdmab_addr(rep->rr_rdmabuf), 185 wc->byte_len, DMA_FROM_DEVICE); 186 187 rpcrdma_reply_handler(rep); 188 return; 189 190 out_flushed: 191 rpcrdma_flush_disconnect(r_xprt, wc); 192 rpcrdma_rep_put(&r_xprt->rx_buf, rep); 193 } 194 195 static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep, 196 struct rdma_conn_param *param) 197 { 198 const struct rpcrdma_connect_private *pmsg = param->private_data; 199 unsigned int rsize, wsize; 200 201 /* Default settings for RPC-over-RDMA Version One */ 202 rsize = RPCRDMA_V1_DEF_INLINE_SIZE; 203 wsize = RPCRDMA_V1_DEF_INLINE_SIZE; 204 205 if (pmsg && 206 pmsg->cp_magic == rpcrdma_cmp_magic && 207 pmsg->cp_version == RPCRDMA_CMP_VERSION) { 208 rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size); 209 wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size); 210 } 211 212 if (rsize < ep->re_inline_recv) 213 ep->re_inline_recv = rsize; 214 if (wsize < ep->re_inline_send) 215 ep->re_inline_send = wsize; 216 217 rpcrdma_set_max_header_sizes(ep); 218 } 219 220 /** 221 * rpcrdma_cm_event_handler - Handle RDMA CM events 222 * @id: rdma_cm_id on which an event has occurred 223 * @event: details of the event 224 * 225 * Called with @id's mutex held. Returns 1 if caller should 226 * destroy @id, otherwise 0. 227 */ 228 static int 229 rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) 230 { 231 struct rpcrdma_ep *ep = id->context; 232 233 might_sleep(); 234 235 switch (event->event) { 236 case RDMA_CM_EVENT_ADDR_RESOLVED: 237 case RDMA_CM_EVENT_ROUTE_RESOLVED: 238 ep->re_async_rc = 0; 239 complete(&ep->re_done); 240 return 0; 241 case RDMA_CM_EVENT_ADDR_ERROR: 242 ep->re_async_rc = -EPROTO; 243 complete(&ep->re_done); 244 return 0; 245 case RDMA_CM_EVENT_ROUTE_ERROR: 246 ep->re_async_rc = -ENETUNREACH; 247 complete(&ep->re_done); 248 return 0; 249 case RDMA_CM_EVENT_ADDR_CHANGE: 250 switch (xchg(&ep->re_connect_status, -ENODEV)) { 251 case 0: 252 goto wake_connect_worker; 253 case 1: 254 /* The later DISCONNECTED event balances the 255 * ESTABLISHED get; do not put here. 256 */ 257 rpcrdma_force_disconnect(ep); 258 return 0; 259 } 260 return 0; 261 case RDMA_CM_EVENT_ESTABLISHED: 262 rpcrdma_ep_get(ep); 263 ep->re_connect_status = 1; 264 rpcrdma_update_cm_private(ep, &event->param.conn); 265 trace_xprtrdma_inline_thresh(ep); 266 wake_up_all(&ep->re_connect_wait); 267 break; 268 case RDMA_CM_EVENT_CONNECT_ERROR: 269 ep->re_connect_status = -ENOTCONN; 270 goto wake_connect_worker; 271 case RDMA_CM_EVENT_UNREACHABLE: 272 ep->re_connect_status = -ENETUNREACH; 273 goto wake_connect_worker; 274 case RDMA_CM_EVENT_REJECTED: 275 ep->re_connect_status = -ECONNREFUSED; 276 if (event->status == IB_CM_REJ_STALE_CONN) 277 ep->re_connect_status = -ENOTCONN; 278 wake_connect_worker: 279 wake_up_all(&ep->re_connect_wait); 280 return 0; 281 case RDMA_CM_EVENT_DISCONNECTED: 282 ep->re_connect_status = -ECONNABORTED; 283 rpcrdma_force_disconnect(ep); 284 return rpcrdma_ep_put(ep); 285 default: 286 break; 287 } 288 289 return 0; 290 } 291 292 static void rpcrdma_ep_removal_done(struct rpcrdma_notification *rn) 293 { 294 struct rpcrdma_ep *ep = container_of(rn, struct rpcrdma_ep, re_rn); 295 296 trace_xprtrdma_device_removal(ep->re_id); 297 xprt_force_disconnect(ep->re_xprt); 298 } 299 300 static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt, 301 struct rpcrdma_ep *ep) 302 { 303 unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1; 304 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 305 struct rdma_cm_id *id; 306 int rc; 307 308 init_completion(&ep->re_done); 309 310 id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep, 311 RDMA_PS_TCP, IB_QPT_RC); 312 if (IS_ERR(id)) 313 return id; 314 315 ep->re_async_rc = -ETIMEDOUT; 316 rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr, 317 RDMA_RESOLVE_TIMEOUT); 318 if (rc) 319 goto out; 320 rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); 321 if (rc < 0) 322 goto out; 323 324 rc = ep->re_async_rc; 325 if (rc) 326 goto out; 327 328 ep->re_async_rc = -ETIMEDOUT; 329 rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT); 330 if (rc) 331 goto out; 332 rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout); 333 if (rc < 0) 334 goto out; 335 rc = ep->re_async_rc; 336 if (rc) 337 goto out; 338 339 ep->re_id = id; 340 rc = rpcrdma_rn_register(id->device, &ep->re_rn, rpcrdma_ep_removal_done); 341 if (rc) 342 goto out; 343 344 return id; 345 346 out: 347 rdma_destroy_id(id); 348 return ERR_PTR(rc); 349 } 350 351 static void rpcrdma_ep_destroy(struct kref *kref) 352 { 353 struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref); 354 355 if (ep->re_id->qp) { 356 rdma_destroy_qp(ep->re_id); 357 ep->re_id->qp = NULL; 358 } 359 360 if (ep->re_attr.recv_cq) 361 ib_free_cq(ep->re_attr.recv_cq); 362 ep->re_attr.recv_cq = NULL; 363 if (ep->re_attr.send_cq) 364 ib_free_cq(ep->re_attr.send_cq); 365 ep->re_attr.send_cq = NULL; 366 367 if (ep->re_pd) 368 ib_dealloc_pd(ep->re_pd); 369 ep->re_pd = NULL; 370 371 rpcrdma_rn_unregister(ep->re_id->device, &ep->re_rn); 372 373 kfree(ep); 374 module_put(THIS_MODULE); 375 } 376 377 static noinline void rpcrdma_ep_get(struct rpcrdma_ep *ep) 378 { 379 kref_get(&ep->re_kref); 380 } 381 382 /* Returns: 383 * %0 if @ep still has a positive kref count, or 384 * %1 if @ep was destroyed successfully. 385 */ 386 static noinline int rpcrdma_ep_put(struct rpcrdma_ep *ep) 387 { 388 return kref_put(&ep->re_kref, rpcrdma_ep_destroy); 389 } 390 391 static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) 392 { 393 struct rpcrdma_connect_private *pmsg; 394 struct ib_device *device; 395 struct rdma_cm_id *id; 396 struct rpcrdma_ep *ep; 397 int rc; 398 399 ep = kzalloc_obj(*ep, XPRTRDMA_GFP_FLAGS); 400 if (!ep) 401 return -ENOTCONN; 402 ep->re_xprt = &r_xprt->rx_xprt; 403 kref_init(&ep->re_kref); 404 405 id = rpcrdma_create_id(r_xprt, ep); 406 if (IS_ERR(id)) { 407 kfree(ep); 408 return PTR_ERR(id); 409 } 410 __module_get(THIS_MODULE); 411 device = id->device; 412 reinit_completion(&ep->re_done); 413 414 ep->re_max_requests = r_xprt->rx_xprt.max_reqs; 415 ep->re_inline_send = xprt_rdma_max_inline_write; 416 ep->re_inline_recv = xprt_rdma_max_inline_read; 417 rc = frwr_query_device(ep, device); 418 if (rc) 419 goto out_destroy; 420 421 r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests); 422 423 ep->re_attr.srq = NULL; 424 ep->re_attr.cap.max_inline_data = 0; 425 ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR; 426 ep->re_attr.qp_type = IB_QPT_RC; 427 ep->re_attr.port_num = ~0; 428 429 ep->re_send_batch = ep->re_max_requests >> 3; 430 ep->re_send_count = ep->re_send_batch; 431 init_waitqueue_head(&ep->re_connect_wait); 432 433 ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt, 434 ep->re_attr.cap.max_send_wr, 435 IB_POLL_WORKQUEUE); 436 if (IS_ERR(ep->re_attr.send_cq)) { 437 rc = PTR_ERR(ep->re_attr.send_cq); 438 ep->re_attr.send_cq = NULL; 439 goto out_destroy; 440 } 441 442 ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt, 443 ep->re_attr.cap.max_recv_wr, 444 IB_POLL_WORKQUEUE); 445 if (IS_ERR(ep->re_attr.recv_cq)) { 446 rc = PTR_ERR(ep->re_attr.recv_cq); 447 ep->re_attr.recv_cq = NULL; 448 goto out_destroy; 449 } 450 ep->re_receive_count = 0; 451 452 /* Initialize cma parameters */ 453 memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma)); 454 455 /* Prepare RDMA-CM private message */ 456 pmsg = &ep->re_cm_private; 457 pmsg->cp_magic = rpcrdma_cmp_magic; 458 pmsg->cp_version = RPCRDMA_CMP_VERSION; 459 pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK; 460 pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send); 461 pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv); 462 ep->re_remote_cma.private_data = pmsg; 463 ep->re_remote_cma.private_data_len = sizeof(*pmsg); 464 465 /* Client offers RDMA Read but does not initiate */ 466 ep->re_remote_cma.initiator_depth = 0; 467 ep->re_remote_cma.responder_resources = 468 min_t(int, U8_MAX, device->attrs.max_qp_rd_atom); 469 470 /* Limit transport retries so client can detect server 471 * GID changes quickly. RPC layer handles re-establishing 472 * transport connection and retransmission. 473 */ 474 ep->re_remote_cma.retry_count = 6; 475 476 /* RPC-over-RDMA handles its own flow control. In addition, 477 * make all RNR NAKs visible so we know that RPC-over-RDMA 478 * flow control is working correctly (no NAKs should be seen). 479 */ 480 ep->re_remote_cma.flow_control = 0; 481 ep->re_remote_cma.rnr_retry_count = 0; 482 483 ep->re_pd = ib_alloc_pd(device, 0); 484 if (IS_ERR(ep->re_pd)) { 485 rc = PTR_ERR(ep->re_pd); 486 ep->re_pd = NULL; 487 goto out_destroy; 488 } 489 490 rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr); 491 if (rc) 492 goto out_destroy; 493 494 r_xprt->rx_ep = ep; 495 return 0; 496 497 out_destroy: 498 rpcrdma_ep_put(ep); 499 rdma_destroy_id(id); 500 return rc; 501 } 502 503 /** 504 * rpcrdma_xprt_connect - Connect an unconnected transport 505 * @r_xprt: controlling transport instance 506 * 507 * Returns 0 on success or a negative errno. 508 */ 509 int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt) 510 { 511 struct rpc_xprt *xprt = &r_xprt->rx_xprt; 512 struct rpcrdma_ep *ep; 513 int rc; 514 515 rc = rpcrdma_ep_create(r_xprt); 516 if (rc) 517 return rc; 518 ep = r_xprt->rx_ep; 519 520 xprt_clear_connected(xprt); 521 rpcrdma_reset_cwnd(r_xprt); 522 523 /* Bump the ep's reference count while there are 524 * outstanding Receives. 525 */ 526 rpcrdma_ep_get(ep); 527 rpcrdma_post_recvs(r_xprt, 1); 528 529 rc = rdma_connect(ep->re_id, &ep->re_remote_cma); 530 if (rc) 531 goto out; 532 533 if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO) 534 xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; 535 wait_event_interruptible(ep->re_connect_wait, 536 ep->re_connect_status != 0); 537 if (ep->re_connect_status <= 0) { 538 rc = ep->re_connect_status; 539 goto out; 540 } 541 542 rc = rpcrdma_sendctxs_create(r_xprt); 543 if (rc) { 544 rc = -ENOTCONN; 545 goto out; 546 } 547 548 rc = rpcrdma_reqs_setup(r_xprt); 549 if (rc) { 550 rc = -ENOTCONN; 551 goto out; 552 } 553 rpcrdma_mrs_create(r_xprt); 554 555 /* 556 * rpcrdma_encode_write_list() dereferences the write-pad 557 * MR with no NULL check, so fail the connect rather than 558 * publish a transport whose write-pad MR is NULL. 559 */ 560 rc = frwr_wp_create(r_xprt); 561 if (rc) { 562 rc = -ENOTCONN; 563 goto out; 564 } 565 566 out: 567 trace_xprtrdma_connect(r_xprt, rc); 568 return rc; 569 } 570 571 /** 572 * rpcrdma_xprt_disconnect - Disconnect underlying transport 573 * @r_xprt: controlling transport instance 574 * 575 * Caller serializes. Either the transport send lock is held, 576 * or we're being called to destroy the transport. 577 * 578 * On return, @r_xprt is completely divested of all hardware 579 * resources and prepared for the next ->connect operation. 580 */ 581 void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt) 582 { 583 struct rpcrdma_ep *ep = r_xprt->rx_ep; 584 struct rdma_cm_id *id; 585 int rc; 586 587 if (!ep) 588 return; 589 590 id = ep->re_id; 591 rc = rdma_disconnect(id); 592 trace_xprtrdma_disconnect(r_xprt, rc); 593 594 rpcrdma_xprt_drain(r_xprt); 595 rpcrdma_reps_unmap(r_xprt); 596 rpcrdma_sendctxs_destroy(r_xprt); 597 rpcrdma_reqs_reset(r_xprt); 598 rpcrdma_mrs_destroy(r_xprt); 599 600 if (rpcrdma_ep_put(ep)) 601 rdma_destroy_id(id); 602 603 r_xprt->rx_ep = NULL; 604 } 605 606 /* Fixed-size circular FIFO queue. This implementation is wait-free and 607 * lock-free. 608 * 609 * Consumer is the code path that posts Sends. This path dequeues a 610 * sendctx for use by a Send operation. Multiple consumer threads 611 * are serialized by the RPC transport lock, which allows only one 612 * ->send_request call at a time. 613 * 614 * Producer is the code path that handles Send completions. This path 615 * enqueues a sendctx that has been completed. Multiple producer 616 * threads are serialized by the ib_poll_cq() function. 617 */ 618 619 /* rpcrdma_sendctxs_destroy() assumes caller has already quiesced 620 * queue activity, and rpcrdma_xprt_drain has flushed all remaining 621 * Send requests. 622 */ 623 static void rpcrdma_sendctxs_destroy(struct rpcrdma_xprt *r_xprt) 624 { 625 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 626 unsigned long i; 627 628 if (!buf->rb_sc_ctxs) 629 return; 630 631 /* The QP is drained, but the final unsignaled Sends might not 632 * have been walked by a signaled Send completion. Release those 633 * Send owners before request buffers are reset. 634 * 635 * Unlike the completion sweep, this walk can visit slots with 636 * no Send posted: after a partial rpcrdma_sendctxs_create() 637 * failure on reconnect, rb_sc_head and rb_sc_tail are stale, 638 * and slots between them can be NULL or have sc_req clear. 639 */ 640 for (i = rpcrdma_sendctx_next(buf, buf->rb_sc_tail); 641 i != rpcrdma_sendctx_next(buf, buf->rb_sc_head); 642 i = rpcrdma_sendctx_next(buf, i)) { 643 struct rpcrdma_sendctx *sc = buf->rb_sc_ctxs[i]; 644 645 if (sc && sc->sc_req) 646 rpcrdma_sendctx_unmap(sc); 647 } 648 649 for (i = 0; i <= buf->rb_sc_last; i++) 650 kfree(buf->rb_sc_ctxs[i]); 651 kfree(buf->rb_sc_ctxs); 652 buf->rb_sc_ctxs = NULL; 653 } 654 655 static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) 656 { 657 struct rpcrdma_sendctx *sc; 658 659 sc = kzalloc_flex(*sc, sc_sges, ep->re_attr.cap.max_send_sge, 660 XPRTRDMA_GFP_FLAGS); 661 if (!sc) 662 return NULL; 663 664 sc->sc_cqe.done = rpcrdma_wc_send; 665 sc->sc_cid.ci_queue_id = ep->re_attr.send_cq->res.id; 666 sc->sc_cid.ci_completion_id = 667 atomic_inc_return(&ep->re_completion_ids); 668 return sc; 669 } 670 671 static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) 672 { 673 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 674 struct rpcrdma_sendctx *sc; 675 unsigned long i; 676 677 /* Maximum number of concurrent outstanding Send WRs. Capping 678 * the circular queue size stops Send Queue overflow by causing 679 * the ->send_request call to fail temporarily before too many 680 * Sends are posted. 681 */ 682 i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS; 683 buf->rb_sc_ctxs = kzalloc_objs(sc, i, XPRTRDMA_GFP_FLAGS); 684 if (!buf->rb_sc_ctxs) 685 return -ENOMEM; 686 687 buf->rb_sc_last = i - 1; 688 for (i = 0; i <= buf->rb_sc_last; i++) { 689 sc = rpcrdma_sendctx_create(r_xprt->rx_ep); 690 if (!sc) 691 return -ENOMEM; 692 693 buf->rb_sc_ctxs[i] = sc; 694 } 695 696 buf->rb_sc_head = 0; 697 buf->rb_sc_tail = 0; 698 return 0; 699 } 700 701 /* The sendctx queue is not guaranteed to have a size that is a 702 * power of two, thus the helpers in circ_buf.h cannot be used. 703 * The other option is to use modulus (%), which can be expensive. 704 */ 705 static unsigned long rpcrdma_sendctx_next(struct rpcrdma_buffer *buf, 706 unsigned long item) 707 { 708 return likely(item < buf->rb_sc_last) ? item + 1 : 0; 709 } 710 711 static unsigned long rpcrdma_sendctx_prev(struct rpcrdma_buffer *buf, 712 unsigned long item) 713 { 714 return item > 0 ? item - 1 : buf->rb_sc_last; 715 } 716 717 /** 718 * rpcrdma_sendctx_get_locked - Acquire a send context 719 * @r_xprt: controlling transport instance 720 * 721 * Returns pointer to a free send completion context; or NULL if 722 * the queue is empty. 723 * 724 * Usage: Called to acquire an SGE array before preparing a Send WR. 725 * 726 * The caller serializes calls to this function (per transport), and 727 * provides an effective memory barrier that flushes the new value 728 * of rb_sc_head. 729 */ 730 struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_xprt *r_xprt) 731 { 732 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 733 struct rpcrdma_sendctx *sc; 734 unsigned long next_head; 735 736 next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head); 737 738 if (next_head == READ_ONCE(buf->rb_sc_tail)) 739 goto out_emptyq; 740 741 /* ORDER: item must be accessed _before_ head is updated */ 742 sc = buf->rb_sc_ctxs[next_head]; 743 744 /* Releasing the lock in the caller acts as a memory 745 * barrier that flushes rb_sc_head. 746 */ 747 buf->rb_sc_head = next_head; 748 749 return sc; 750 751 out_emptyq: 752 /* The queue is "empty" if there have not been enough Send 753 * completions recently. This is a sign the Send Queue is 754 * backing up. Cause the caller to pause and try again. 755 */ 756 xprt_wait_for_buffer_space(&r_xprt->rx_xprt); 757 r_xprt->rx_stats.empty_sendctx_q++; 758 759 /* Recheck: a Send completion between the ring-empty test 760 * and the set_bit could cause its xprt_write_space() to 761 * miss, leaving XPRT_WRITE_SPACE set with a non-full ring. 762 * The smp_mb__after_atomic() pairs with smp_store_release() 763 * in rpcrdma_sendctx_put_locked(). 764 */ 765 smp_mb__after_atomic(); 766 next_head = rpcrdma_sendctx_next(buf, buf->rb_sc_head); 767 if (next_head != READ_ONCE(buf->rb_sc_tail)) 768 xprt_write_space(&r_xprt->rx_xprt); 769 770 return NULL; 771 } 772 773 /** 774 * rpcrdma_sendctx_unget_locked - Release an unposted send context 775 * @r_xprt: controlling transport instance 776 * @sc: send context to release 777 * 778 * Usage: Called when no Send is posted for the sendctx most 779 * recently returned by rpcrdma_sendctx_get_locked(). 780 * 781 * The caller serializes calls to this function and to 782 * rpcrdma_sendctx_get_locked() (per transport). 783 */ 784 void rpcrdma_sendctx_unget_locked(struct rpcrdma_xprt *r_xprt, 785 struct rpcrdma_sendctx *sc) 786 { 787 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 788 789 if (WARN_ON_ONCE(buf->rb_sc_ctxs[buf->rb_sc_head] != sc)) 790 return; 791 792 buf->rb_sc_head = rpcrdma_sendctx_prev(buf, buf->rb_sc_head); 793 xprt_write_space(&r_xprt->rx_xprt); 794 } 795 796 /** 797 * rpcrdma_sendctx_put_locked - Release a send context 798 * @r_xprt: controlling transport instance 799 * @sc: send context to release 800 * 801 * Usage: Called from Send completion to return a sendctxt 802 * to the queue. 803 * 804 * The caller serializes calls to this function (per transport). 805 */ 806 static void rpcrdma_sendctx_put_locked(struct rpcrdma_xprt *r_xprt, 807 struct rpcrdma_sendctx *sc) 808 { 809 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 810 unsigned long next_tail; 811 812 /* Release previously completed but unsignaled Sends by walking 813 * up the queue until @sc is found. 814 */ 815 next_tail = buf->rb_sc_tail; 816 do { 817 struct rpcrdma_sendctx *cur; 818 819 next_tail = rpcrdma_sendctx_next(buf, next_tail); 820 821 /* ORDER: item must be accessed _before_ tail is updated */ 822 cur = buf->rb_sc_ctxs[next_tail]; 823 rpcrdma_sendctx_unmap(cur); 824 825 } while (buf->rb_sc_ctxs[next_tail] != sc); 826 827 /* Paired with READ_ONCE in rpcrdma_sendctx_get_locked(): 828 * both the fast-path ring-full test and the post-set_bit 829 * recheck in the slow path depend on this store-release. 830 */ 831 smp_store_release(&buf->rb_sc_tail, next_tail); 832 833 xprt_write_space(&r_xprt->rx_xprt); 834 } 835 836 static void 837 rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) 838 { 839 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 840 struct rpcrdma_ep *ep = r_xprt->rx_ep; 841 struct ib_device *device = ep->re_id->device; 842 unsigned int count; 843 844 /* Try to allocate enough to perform one full-sized I/O */ 845 for (count = 0; count < ep->re_max_rdma_segs; count++) { 846 struct rpcrdma_mr *mr; 847 int rc; 848 849 mr = kzalloc_node(sizeof(*mr), XPRTRDMA_GFP_FLAGS, 850 ibdev_to_node(device)); 851 if (!mr) 852 break; 853 854 rc = frwr_mr_init(r_xprt, mr); 855 if (rc) { 856 kfree(mr); 857 break; 858 } 859 860 spin_lock(&buf->rb_lock); 861 rpcrdma_mr_push(mr, &buf->rb_mrs); 862 list_add(&mr->mr_all, &buf->rb_all_mrs); 863 spin_unlock(&buf->rb_lock); 864 } 865 866 r_xprt->rx_stats.mrs_allocated += count; 867 trace_xprtrdma_createmrs(r_xprt, count); 868 } 869 870 static void 871 rpcrdma_mr_refresh_worker(struct work_struct *work) 872 { 873 struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, 874 rb_refresh_worker); 875 struct rpcrdma_xprt *r_xprt = container_of(buf, struct rpcrdma_xprt, 876 rx_buf); 877 878 rpcrdma_mrs_create(r_xprt); 879 xprt_write_space(&r_xprt->rx_xprt); 880 } 881 882 /** 883 * rpcrdma_mrs_refresh - Wake the MR refresh worker 884 * @r_xprt: controlling transport instance 885 * 886 */ 887 void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) 888 { 889 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 890 struct rpcrdma_ep *ep = r_xprt->rx_ep; 891 892 /* If there is no underlying connection, it's no use 893 * to wake the refresh worker. 894 */ 895 if (ep->re_connect_status != 1) 896 return; 897 queue_work(system_highpri_wq, &buf->rb_refresh_worker); 898 } 899 900 /** 901 * rpcrdma_req_create - Allocate an rpcrdma_req object 902 * @r_xprt: controlling r_xprt 903 * @size: initial size, in bytes, of send and receive buffers 904 * 905 * Returns an allocated and fully initialized rpcrdma_req or NULL. 906 */ 907 struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, 908 size_t size) 909 { 910 struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; 911 struct rpcrdma_req *req; 912 913 req = kzalloc_obj(*req, XPRTRDMA_GFP_FLAGS); 914 if (req == NULL) 915 goto out1; 916 917 req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE); 918 if (!req->rl_sendbuf) 919 goto out2; 920 921 req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE); 922 if (!req->rl_recvbuf) 923 goto out3; 924 925 INIT_LIST_HEAD(&req->rl_free_mrs); 926 INIT_LIST_HEAD(&req->rl_registered); 927 spin_lock(&buffer->rb_lock); 928 list_add(&req->rl_all, &buffer->rb_allreqs); 929 spin_unlock(&buffer->rb_lock); 930 return req; 931 932 out3: 933 rpcrdma_regbuf_free(req->rl_sendbuf); 934 out2: 935 kfree(req); 936 out1: 937 return NULL; 938 } 939 940 /** 941 * rpcrdma_req_setup - Per-connection instance setup of an rpcrdma_req object 942 * @r_xprt: controlling transport instance 943 * @req: rpcrdma_req object to set up 944 * 945 * Returns zero on success, and a negative errno on failure. 946 */ 947 int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) 948 { 949 struct rpcrdma_regbuf *rb; 950 size_t maxhdrsize; 951 952 /* Compute maximum header buffer size in bytes */ 953 maxhdrsize = rpcrdma_fixed_maxsz + 3 + 954 r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz; 955 maxhdrsize *= sizeof(__be32); 956 rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), 957 DMA_TO_DEVICE); 958 if (!rb) 959 goto out; 960 961 if (!__rpcrdma_regbuf_dma_map(r_xprt, rb)) 962 goto out_free; 963 964 req->rl_rdmabuf = rb; 965 xdr_buf_init(&req->rl_hdrbuf, rdmab_data(rb), rdmab_length(rb)); 966 return 0; 967 968 out_free: 969 rpcrdma_regbuf_free(rb); 970 out: 971 return -ENOMEM; 972 } 973 974 /* ASSUMPTION: the rb_allreqs list is stable for the duration, 975 * and thus can be walked without holding rb_lock. Eg. the 976 * caller is holding the transport send lock to exclude 977 * device removal or disconnection. 978 */ 979 static int rpcrdma_reqs_setup(struct rpcrdma_xprt *r_xprt) 980 { 981 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 982 struct rpcrdma_req *req; 983 int rc; 984 985 list_for_each_entry(req, &buf->rb_allreqs, rl_all) { 986 rc = rpcrdma_req_setup(r_xprt, req); 987 if (rc) 988 return rc; 989 } 990 return 0; 991 } 992 993 static void rpcrdma_req_reset(struct rpcrdma_req *req) 994 { 995 struct rpcrdma_mr *mr; 996 997 /* Credits are valid for only one connection */ 998 req->rl_slot.rq_cong = 0; 999 1000 rpcrdma_regbuf_free(req->rl_rdmabuf); 1001 req->rl_rdmabuf = NULL; 1002 1003 rpcrdma_regbuf_dma_unmap(req->rl_sendbuf); 1004 rpcrdma_regbuf_dma_unmap(req->rl_recvbuf); 1005 1006 /* The verbs consumer can't know the state of an MR on the 1007 * req->rl_registered list unless a successful completion 1008 * has occurred, so they cannot be re-used. 1009 */ 1010 while ((mr = rpcrdma_mr_pop(&req->rl_registered))) { 1011 struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf; 1012 1013 spin_lock(&buf->rb_lock); 1014 list_del(&mr->mr_all); 1015 spin_unlock(&buf->rb_lock); 1016 1017 frwr_mr_release(mr); 1018 } 1019 } 1020 1021 /* ASSUMPTION: the rb_allreqs list is stable for the duration, 1022 * and thus can be walked without holding rb_lock. Eg. the 1023 * caller is holding the transport send lock to exclude 1024 * device removal or disconnection. 1025 */ 1026 static void rpcrdma_reqs_reset(struct rpcrdma_xprt *r_xprt) 1027 { 1028 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1029 struct rpcrdma_req *req; 1030 1031 list_for_each_entry(req, &buf->rb_allreqs, rl_all) 1032 rpcrdma_req_reset(req); 1033 } 1034 1035 static noinline 1036 struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt) 1037 { 1038 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1039 struct rpcrdma_ep *ep = r_xprt->rx_ep; 1040 struct ib_device *device = ep->re_id->device; 1041 struct rpcrdma_rep *rep; 1042 1043 rep = kzalloc_obj(*rep, XPRTRDMA_GFP_FLAGS); 1044 if (rep == NULL) 1045 goto out; 1046 1047 rep->rr_rdmabuf = rpcrdma_regbuf_alloc_node(ep->re_inline_recv, 1048 DMA_FROM_DEVICE, 1049 ibdev_to_node(device)); 1050 if (!rep->rr_rdmabuf) 1051 goto out_free; 1052 1053 rep->rr_cid.ci_completion_id = 1054 atomic_inc_return(&r_xprt->rx_ep->re_completion_ids); 1055 1056 xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rep->rr_rdmabuf), 1057 rdmab_length(rep->rr_rdmabuf)); 1058 rep->rr_cqe.done = rpcrdma_wc_receive; 1059 rep->rr_rxprt = r_xprt; 1060 rep->rr_recv_wr.next = NULL; 1061 rep->rr_recv_wr.wr_cqe = &rep->rr_cqe; 1062 rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov; 1063 rep->rr_recv_wr.num_sge = 1; 1064 1065 spin_lock(&buf->rb_lock); 1066 list_add(&rep->rr_all, &buf->rb_all_reps); 1067 spin_unlock(&buf->rb_lock); 1068 return rep; 1069 1070 out_free: 1071 kfree(rep); 1072 out: 1073 return NULL; 1074 } 1075 1076 static void rpcrdma_rep_free(struct rpcrdma_rep *rep) 1077 { 1078 rpcrdma_regbuf_free(rep->rr_rdmabuf); 1079 kfree(rep); 1080 } 1081 1082 static struct rpcrdma_rep *rpcrdma_rep_get_locked(struct rpcrdma_buffer *buf) 1083 { 1084 struct llist_node *node; 1085 1086 /* Calls to llist_del_first are required to be serialized */ 1087 node = llist_del_first(&buf->rb_free_reps); 1088 if (!node) 1089 return NULL; 1090 return llist_entry(node, struct rpcrdma_rep, rr_node); 1091 } 1092 1093 /** 1094 * rpcrdma_rep_put - Release rpcrdma_rep back to free list 1095 * @buf: buffer pool 1096 * @rep: rep to release 1097 * 1098 * The rep's transient association with an rpc_rqst, established 1099 * by rpcrdma_reply_handler() and torn down here, must not survive 1100 * onto rb_free_reps: rpcrdma_post_recvs() pulls reps from the free 1101 * list to re-post them, and a non-NULL rr_rqst on a free-listed rep 1102 * would imply the rep is still referenced by a req. 1103 */ 1104 void rpcrdma_rep_put(struct rpcrdma_buffer *buf, struct rpcrdma_rep *rep) 1105 { 1106 rep->rr_rqst = NULL; 1107 llist_add(&rep->rr_node, &buf->rb_free_reps); 1108 } 1109 1110 /* Caller must ensure the QP is quiescent (RQ is drained) before 1111 * invoking this function, to guarantee rb_all_reps is not 1112 * changing. 1113 */ 1114 static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt) 1115 { 1116 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1117 struct rpcrdma_rep *rep; 1118 1119 list_for_each_entry(rep, &buf->rb_all_reps, rr_all) 1120 rpcrdma_regbuf_dma_unmap(rep->rr_rdmabuf); 1121 } 1122 1123 static void rpcrdma_reps_destroy(struct rpcrdma_buffer *buf) 1124 { 1125 struct rpcrdma_rep *rep; 1126 1127 spin_lock(&buf->rb_lock); 1128 while ((rep = list_first_entry_or_null(&buf->rb_all_reps, 1129 struct rpcrdma_rep, 1130 rr_all)) != NULL) { 1131 list_del(&rep->rr_all); 1132 spin_unlock(&buf->rb_lock); 1133 1134 rpcrdma_rep_free(rep); 1135 1136 spin_lock(&buf->rb_lock); 1137 } 1138 spin_unlock(&buf->rb_lock); 1139 } 1140 1141 static unsigned int rpcrdma_req_pool_slack(unsigned int max_reqs) 1142 { 1143 /* The sendctx ring can hold up to one Send-signaling batch 1144 * (re_send_batch, set by frwr_open() to re_max_requests >> 3) 1145 * of unfinished Sends. Each pins its req until a signaled Send 1146 * completion releases the sendctx. Size the pool above max_reqs 1147 * by that batch so the recycle delay does not stall a slot 1148 * allocation that the RPC/RDMA credit window would admit. 1149 * 1150 * Round up: re_max_requests >> 3 is zero when max_reqs < 8, but 1151 * a single unsignaled Send is still enough to pin one req. One 1152 * slack slot covers that case. 1153 */ 1154 return DIV_ROUND_UP(max_reqs, 8); 1155 } 1156 1157 /** 1158 * rpcrdma_buffer_create - Create initial set of req/rep objects 1159 * @r_xprt: transport instance to (re)initialize 1160 * 1161 * Returns zero on success, otherwise a negative errno. 1162 */ 1163 int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) 1164 { 1165 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1166 unsigned int max_reqs; 1167 int i, rc; 1168 1169 buf->rb_bc_srv_max_requests = 0; 1170 spin_lock_init(&buf->rb_lock); 1171 INIT_LIST_HEAD(&buf->rb_mrs); 1172 INIT_LIST_HEAD(&buf->rb_all_mrs); 1173 INIT_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); 1174 1175 init_llist_head(&buf->rb_send_bufs); 1176 INIT_LIST_HEAD(&buf->rb_allreqs); 1177 INIT_LIST_HEAD(&buf->rb_all_reps); 1178 1179 rc = -ENOMEM; 1180 max_reqs = r_xprt->rx_xprt.max_reqs; 1181 max_reqs += rpcrdma_req_pool_slack(max_reqs); 1182 for (i = 0; i < max_reqs; i++) { 1183 struct rpcrdma_req *req; 1184 1185 req = rpcrdma_req_create(r_xprt, 1186 RPCRDMA_V1_DEF_INLINE_SIZE * 2); 1187 if (!req) 1188 goto out; 1189 llist_add(&req->rl_node, &buf->rb_send_bufs); 1190 } 1191 1192 init_llist_head(&buf->rb_free_reps); 1193 1194 return 0; 1195 out: 1196 rpcrdma_buffer_destroy(buf); 1197 return rc; 1198 } 1199 1200 /** 1201 * rpcrdma_req_destroy - Destroy an rpcrdma_req object 1202 * @req: unused object to be destroyed 1203 * 1204 * Relies on caller holding the transport send lock to protect 1205 * removing req->rl_all from buf->rb_all_reqs safely. 1206 */ 1207 void rpcrdma_req_destroy(struct rpcrdma_req *req) 1208 { 1209 struct rpcrdma_mr *mr; 1210 1211 list_del(&req->rl_all); 1212 1213 while ((mr = rpcrdma_mr_pop(&req->rl_free_mrs))) { 1214 struct rpcrdma_buffer *buf = &mr->mr_xprt->rx_buf; 1215 1216 spin_lock(&buf->rb_lock); 1217 list_del(&mr->mr_all); 1218 spin_unlock(&buf->rb_lock); 1219 1220 frwr_mr_release(mr); 1221 } 1222 1223 rpcrdma_regbuf_free(req->rl_recvbuf); 1224 rpcrdma_regbuf_free(req->rl_sendbuf); 1225 rpcrdma_regbuf_free(req->rl_rdmabuf); 1226 kfree(req); 1227 } 1228 1229 /** 1230 * rpcrdma_mrs_destroy - Release all of a transport's MRs 1231 * @r_xprt: controlling transport instance 1232 * 1233 * Relies on caller holding the transport send lock to protect 1234 * removing mr->mr_list from req->rl_free_mrs safely. 1235 */ 1236 static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt) 1237 { 1238 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1239 struct rpcrdma_mr *mr; 1240 1241 cancel_work_sync(&buf->rb_refresh_worker); 1242 1243 spin_lock(&buf->rb_lock); 1244 while ((mr = list_first_entry_or_null(&buf->rb_all_mrs, 1245 struct rpcrdma_mr, 1246 mr_all)) != NULL) { 1247 list_del(&mr->mr_list); 1248 list_del(&mr->mr_all); 1249 spin_unlock(&buf->rb_lock); 1250 1251 frwr_mr_release(mr); 1252 1253 spin_lock(&buf->rb_lock); 1254 } 1255 spin_unlock(&buf->rb_lock); 1256 } 1257 1258 /** 1259 * rpcrdma_buffer_destroy - Release all hw resources 1260 * @buf: root control block for resources 1261 * 1262 * ORDERING: relies on a prior rpcrdma_xprt_drain : 1263 * - No more Send or Receive completions can occur 1264 * - All MRs, reps, and reqs are returned to their free lists 1265 */ 1266 void 1267 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) 1268 { 1269 struct rpcrdma_req *req, *next; 1270 struct llist_node *node; 1271 1272 rpcrdma_reps_destroy(buf); 1273 1274 node = llist_del_all(&buf->rb_send_bufs); 1275 llist_for_each_entry_safe(req, next, node, rl_node) 1276 rpcrdma_req_destroy(req); 1277 } 1278 1279 /** 1280 * rpcrdma_mr_get - Allocate an rpcrdma_mr object 1281 * @r_xprt: controlling transport 1282 * 1283 * Returns an initialized rpcrdma_mr or NULL if no free 1284 * rpcrdma_mr objects are available. 1285 */ 1286 struct rpcrdma_mr * 1287 rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt) 1288 { 1289 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1290 struct rpcrdma_mr *mr; 1291 1292 spin_lock(&buf->rb_lock); 1293 mr = rpcrdma_mr_pop(&buf->rb_mrs); 1294 spin_unlock(&buf->rb_lock); 1295 return mr; 1296 } 1297 1298 /** 1299 * rpcrdma_reply_put - Put reply buffers back into pool 1300 * @buffers: buffer pool 1301 * @req: object to return 1302 * 1303 */ 1304 void rpcrdma_reply_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) 1305 { 1306 struct rpcrdma_rep *rep = req->rl_reply; 1307 1308 if (rep) { 1309 req->rl_reply = NULL; 1310 rpcrdma_rep_put(buffers, rep); 1311 } 1312 } 1313 1314 /** 1315 * rpcrdma_buffer_get - Get a request buffer 1316 * @buffers: Buffer pool from which to obtain a buffer 1317 * 1318 * Returns a fresh rpcrdma_req, or NULL if none are available. 1319 */ 1320 struct rpcrdma_req * 1321 rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) 1322 { 1323 struct llist_node *node; 1324 1325 /* Calls to llist_del_first are required to be serialized */ 1326 spin_lock(&buffers->rb_lock); 1327 node = llist_del_first(&buffers->rb_send_bufs); 1328 spin_unlock(&buffers->rb_lock); 1329 if (!node) 1330 return NULL; 1331 return llist_entry(node, struct rpcrdma_req, rl_node); 1332 } 1333 1334 /** 1335 * rpcrdma_buffer_put - Put request/reply buffers back into pool 1336 * @buffers: buffer pool 1337 * @req: object to return 1338 * 1339 */ 1340 void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) 1341 { 1342 rpcrdma_reply_put(buffers, req); 1343 1344 llist_add(&req->rl_node, &buffers->rb_send_bufs); 1345 } 1346 1347 /* Returns a pointer to a rpcrdma_regbuf object, or NULL. 1348 * 1349 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for 1350 * receiving the payload of RDMA RECV operations. During Long Calls 1351 * or Replies they may be registered externally via frwr_map. 1352 */ 1353 static struct rpcrdma_regbuf * 1354 rpcrdma_regbuf_alloc_node(size_t size, enum dma_data_direction direction, 1355 int node) 1356 { 1357 struct rpcrdma_regbuf *rb; 1358 1359 rb = kmalloc_node(sizeof(*rb), XPRTRDMA_GFP_FLAGS, node); 1360 if (!rb) 1361 return NULL; 1362 rb->rg_data = kmalloc_node(size, XPRTRDMA_GFP_FLAGS, node); 1363 if (!rb->rg_data) { 1364 kfree(rb); 1365 return NULL; 1366 } 1367 1368 rb->rg_device = NULL; 1369 rb->rg_direction = direction; 1370 rb->rg_iov.length = size; 1371 return rb; 1372 } 1373 1374 static struct rpcrdma_regbuf * 1375 rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) 1376 { 1377 return rpcrdma_regbuf_alloc_node(size, direction, NUMA_NO_NODE); 1378 } 1379 1380 /** 1381 * rpcrdma_regbuf_realloc - re-allocate a SEND/RECV buffer 1382 * @rb: regbuf to reallocate 1383 * @size: size of buffer to be allocated, in bytes 1384 * @flags: GFP flags 1385 * 1386 * Returns true if reallocation was successful. If false is 1387 * returned, @rb is left untouched. 1388 */ 1389 bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags) 1390 { 1391 return rpcrdma_regbuf_realloc_node(rb, size, flags, NUMA_NO_NODE); 1392 } 1393 1394 static bool rpcrdma_regbuf_realloc_node(struct rpcrdma_regbuf *rb, 1395 size_t size, gfp_t flags, int node) 1396 { 1397 void *buf; 1398 1399 buf = kmalloc_node(size, flags, node); 1400 if (!buf) 1401 return false; 1402 1403 rpcrdma_regbuf_dma_unmap(rb); 1404 kfree(rb->rg_data); 1405 1406 rb->rg_data = buf; 1407 rb->rg_iov.length = size; 1408 return true; 1409 } 1410 1411 static bool rpcrdma_rep_resize(struct rpcrdma_xprt *r_xprt, 1412 struct rpcrdma_rep *rep) 1413 { 1414 struct rpcrdma_regbuf *rb = rep->rr_rdmabuf; 1415 struct rpcrdma_ep *ep = r_xprt->rx_ep; 1416 size_t size = ep->re_inline_recv; 1417 1418 if (likely(rdmab_length(rb) >= size)) 1419 return true; 1420 if (!rpcrdma_regbuf_realloc_node(rb, size, XPRTRDMA_GFP_FLAGS, 1421 ibdev_to_node(ep->re_id->device))) 1422 return false; 1423 1424 xdr_buf_init(&rep->rr_hdrbuf, rdmab_data(rb), rdmab_length(rb)); 1425 return true; 1426 } 1427 1428 /** 1429 * __rpcrdma_regbuf_dma_map - DMA-map a regbuf 1430 * @r_xprt: controlling transport instance 1431 * @rb: regbuf to be mapped 1432 * 1433 * Returns true if the buffer is now DMA mapped to @r_xprt's device 1434 */ 1435 bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt, 1436 struct rpcrdma_regbuf *rb) 1437 { 1438 struct ib_device *device = r_xprt->rx_ep->re_id->device; 1439 1440 if (rb->rg_direction == DMA_NONE) 1441 return false; 1442 1443 rb->rg_iov.addr = ib_dma_map_single(device, rdmab_data(rb), 1444 rdmab_length(rb), rb->rg_direction); 1445 if (ib_dma_mapping_error(device, rdmab_addr(rb))) { 1446 trace_xprtrdma_dma_maperr(rdmab_addr(rb)); 1447 return false; 1448 } 1449 1450 rb->rg_device = device; 1451 rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey; 1452 return true; 1453 } 1454 1455 static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb) 1456 { 1457 if (!rb) 1458 return; 1459 1460 if (!rpcrdma_regbuf_is_mapped(rb)) 1461 return; 1462 1463 ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb), rdmab_length(rb), 1464 rb->rg_direction); 1465 rb->rg_device = NULL; 1466 } 1467 1468 static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb) 1469 { 1470 rpcrdma_regbuf_dma_unmap(rb); 1471 if (rb) 1472 kfree(rb->rg_data); 1473 kfree(rb); 1474 } 1475 1476 /** 1477 * rpcrdma_post_recvs - Refill the Receive Queue 1478 * @r_xprt: controlling transport instance 1479 * @needed: current credit grant 1480 * 1481 */ 1482 void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed) 1483 { 1484 struct rpcrdma_buffer *buf = &r_xprt->rx_buf; 1485 struct rpcrdma_ep *ep = r_xprt->rx_ep; 1486 struct ib_recv_wr *wr, *bad_wr; 1487 struct rpcrdma_rep *rep; 1488 int count, rc; 1489 1490 rc = 0; 1491 count = 0; 1492 1493 if (likely(ep->re_receive_count > needed)) 1494 goto out; 1495 needed -= ep->re_receive_count; 1496 needed += ep->re_recv_batch; 1497 1498 if (atomic_inc_return(&ep->re_receiving) > 1) 1499 goto out_dec; 1500 1501 /* fast path: all needed reps can be found on the free list */ 1502 wr = NULL; 1503 while (needed) { 1504 rep = rpcrdma_rep_get_locked(buf); 1505 if (!rep) 1506 rep = rpcrdma_rep_create(r_xprt); 1507 if (!rep) 1508 break; 1509 /* I1: a rep on rb_free_reps must carry no rqst pointer. */ 1510 WARN_ON_ONCE(rep->rr_rqst); 1511 if (!rpcrdma_rep_resize(r_xprt, rep)) { 1512 rpcrdma_rep_put(buf, rep); 1513 break; 1514 } 1515 if (!rpcrdma_regbuf_dma_map(r_xprt, rep->rr_rdmabuf)) { 1516 rpcrdma_rep_put(buf, rep); 1517 break; 1518 } 1519 1520 rep->rr_cid.ci_queue_id = ep->re_attr.recv_cq->res.id; 1521 trace_xprtrdma_post_recv(&rep->rr_cid); 1522 rep->rr_recv_wr.next = wr; 1523 wr = &rep->rr_recv_wr; 1524 --needed; 1525 ++count; 1526 } 1527 if (!wr) 1528 goto out_dec; 1529 1530 rc = ib_post_recv(ep->re_id->qp, wr, 1531 (const struct ib_recv_wr **)&bad_wr); 1532 if (rc) { 1533 trace_xprtrdma_post_recvs_err(r_xprt, rc); 1534 for (wr = bad_wr; wr;) { 1535 struct rpcrdma_rep *rep; 1536 1537 rep = container_of(wr, struct rpcrdma_rep, rr_recv_wr); 1538 wr = wr->next; 1539 rpcrdma_rep_put(buf, rep); 1540 --count; 1541 } 1542 } 1543 1544 out_dec: 1545 if (atomic_dec_return(&ep->re_receiving) > 0) 1546 complete(&ep->re_done); 1547 out: 1548 trace_xprtrdma_post_recvs(r_xprt, count); 1549 ep->re_receive_count += count; 1550 return; 1551 } 1552