/*
 * Copyright (c) 2003-2007 Network Appliance, Inc. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the BSD-type
 * license below:
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 *      Redistributions of source code must retain the above copyright
 *      notice, this list of conditions and the following disclaimer.
 *
 *      Redistributions in binary form must reproduce the above
 *      copyright notice, this list of conditions and the following
 *      disclaimer in the documentation and/or other materials provided
 *      with the distribution.
 *
 *      Neither the name of the Network Appliance, Inc. nor the names of
 *      its contributors may be used to endorse or promote products
 *      derived from this software without specific prior written
 *      permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
 * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * verbs.c
 *
 * Encapsulates the major functions managing:
 *  o adapters
 *  o endpoints
 *  o connections
 *  o buffer memory
 */

#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
#include <asm/bitops.h>

#include "xprt_rdma.h"

/*
 * Globals/Macros
 */

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY	RPCDBG_TRANS
#endif

/*
 * internal functions
 */

/*
 * handle replies in tasklet context, using a single, global list
 * rdma tasklet function -- just turn around and call the func
 * for all replies on the list
 */

static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
static LIST_HEAD(rpcrdma_tasklets_g);

static void
rpcrdma_run_tasklet(unsigned long data)
{
	struct rpcrdma_rep *rep;
	unsigned long flags;

	data = data;
	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	while (!list_empty(&rpcrdma_tasklets_g)) {
		rep = list_entry(rpcrdma_tasklets_g.next,
				 struct rpcrdma_rep, rr_list);
		list_del(&rep->rr_list);
		spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);

		rpcrdma_reply_handler(rep);

		spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	}
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
}

static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);

static void
rpcrdma_schedule_tasklet(struct list_head *sched_list)
{
	unsigned long flags;

	spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
	list_splice_tail(sched_list, &rpcrdma_tasklets_g);
	spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
	tasklet_schedule(&rpcrdma_tasklet_g);
}

static void
rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	pr_err("RPC: %s: %s on device %s ep %p\n",
	       __func__, ib_event_msg(event->event),
	       event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
{
	struct rpcrdma_ep *ep = context;

	pr_err("RPC: %s: %s on device %s ep %p\n",
	       __func__, ib_event_msg(event->event),
	       event->device->name, context);
	if (ep->rep_connected == 1) {
		ep->rep_connected = -EIO;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
	}
}

static void
rpcrdma_sendcq_process_wc(struct ib_wc *wc)
{
	/* WARNING: Only wr_id and status are reliable at this point */
	if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
		if (wc->status != IB_WC_SUCCESS &&
		    wc->status != IB_WC_WR_FLUSH_ERR)
			pr_err("RPC: %s: SEND: %s\n",
			       __func__, ib_wc_status_msg(wc->status));
	} else {
		struct rpcrdma_mw *r;

		r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
		r->mw_sendcompletion(wc);
	}
}

static int
rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
	struct ib_wc *wcs;
	int budget, count, rc;

	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
	do {
		wcs = ep->rep_send_wcs;

		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
		if (rc <= 0)
			return rc;

		count = rc;
		while (count-- > 0)
			rpcrdma_sendcq_process_wc(wcs++);
	} while (rc == RPCRDMA_POLLSIZE && --budget);
	return 0;
}

/*
 * Handle send, fast_reg_mr, and local_inv completions.
 *
 * Send events are typically suppressed and thus do not result
 * in an upcall. Occasionally one is signaled, however. This
 * prevents the provider's completion queue from wrapping and
 * losing a completion.
 */
static void
rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
{
	struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
	int rc;

	rc = rpcrdma_sendcq_poll(cq, ep);
	if (rc) {
		dprintk("RPC: %s: ib_poll_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rc = ib_req_notify_cq(cq,
			IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	if (rc == 0)
		return;
	if (rc < 0) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_sendcq_poll(cq, ep);
}

static void
rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
{
	struct rpcrdma_rep *rep =
			(struct rpcrdma_rep *)(unsigned long)wc->wr_id;

	/* WARNING: Only wr_id and status are reliable at this point */
	if (wc->status != IB_WC_SUCCESS)
		goto out_fail;

	/* status == SUCCESS means all fields in wc are trustworthy */
	if (wc->opcode != IB_WC_RECV)
		return;

	dprintk("RPC: %s: rep %p opcode 'recv', length %u: success\n",
		__func__, rep, wc->byte_len);

	rep->rr_len = wc->byte_len;
	ib_dma_sync_single_for_cpu(rep->rr_device,
				   rdmab_addr(rep->rr_rdmabuf),
				   rep->rr_len, DMA_FROM_DEVICE);
	prefetch(rdmab_to_msg(rep->rr_rdmabuf));

out_schedule:
	list_add_tail(&rep->rr_list, sched_list);
	return;
out_fail:
	if (wc->status != IB_WC_WR_FLUSH_ERR)
		pr_err("RPC: %s: rep %p: %s\n",
		       __func__, rep, ib_wc_status_msg(wc->status));
	rep->rr_len = ~0U;
	goto out_schedule;
}

static int
rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
{
	struct list_head sched_list;
	struct ib_wc *wcs;
	int budget, count, rc;

	INIT_LIST_HEAD(&sched_list);
	budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
	do {
		wcs = ep->rep_recv_wcs;

		rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
		if (rc <= 0)
			goto out_schedule;

		count = rc;
		while (count-- > 0)
			rpcrdma_recvcq_process_wc(wcs++, &sched_list);
	} while (rc == RPCRDMA_POLLSIZE && --budget);
	rc = 0;

out_schedule:
	rpcrdma_schedule_tasklet(&sched_list);
	return rc;
}
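/* Note on completion budgeting (illustrative commentary):
 *
 * Both pollers above bound the work done per upcall: at most
 * RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE batches of RPCRDMA_POLLSIZE
 * completions are drained before control is returned.  The upcalls
 * then re-arm the CQ with IB_CQ_REPORT_MISSED_EVENTS and poll once
 * more if the provider reports that completions arrived between the
 * final poll and the re-arm.  For example (hypothetical values only),
 * a budget of 1024 with a poll size of 16 would allow at most 64
 * batches per upcall.
 */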
/*
 * Handle receive completions.
 *
 * It is reentrant but processes single events in order to maintain
 * ordering of receives to keep server credits.
 *
 * It is the responsibility of the scheduled tasklet to return
 * recv buffers to the pool. NOTE: this affects synchronization of
 * connection shutdown. That is, the structures required for
 * the completion of the reply handler must remain intact until
 * all memory has been reclaimed.
 */
static void
rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
{
	struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
	int rc;

	rc = rpcrdma_recvcq_poll(cq, ep);
	if (rc) {
		dprintk("RPC: %s: ib_poll_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rc = ib_req_notify_cq(cq,
			IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
	if (rc == 0)
		return;
	if (rc < 0) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		return;
	}

	rpcrdma_recvcq_poll(cq, ep);
}

static void
rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
{
	struct ib_wc wc;
	LIST_HEAD(sched_list);

	while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
		rpcrdma_recvcq_process_wc(&wc, &sched_list);
	if (!list_empty(&sched_list))
		rpcrdma_schedule_tasklet(&sched_list);
	while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
		rpcrdma_sendcq_process_wc(&wc);
}

static int
rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct rpcrdma_xprt *xprt = id->context;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct rpcrdma_ep *ep = &xprt->rx_ep;
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	struct sockaddr *sap = (struct sockaddr *)&ep->rep_remote_addr;
#endif
	struct ib_qp_attr *attr = &ia->ri_qp_attr;
	struct ib_qp_init_attr *iattr = &ia->ri_qp_init_attr;
	int connstate = 0;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		ia->ri_async_rc = 0;
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
		ia->ri_async_rc = -EHOSTUNREACH;
		dprintk("RPC: %s: CM address resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ROUTE_ERROR:
		ia->ri_async_rc = -ENETUNREACH;
		dprintk("RPC: %s: CM route resolution error, ep 0x%p\n",
			__func__, ep);
		complete(&ia->ri_done);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		connstate = 1;
		ib_query_qp(ia->ri_id->qp, attr,
			    IB_QP_MAX_QP_RD_ATOMIC | IB_QP_MAX_DEST_RD_ATOMIC,
			    iattr);
		dprintk("RPC: %s: %d responder resources"
			" (%d initiator)\n",
			__func__, attr->max_dest_rd_atomic,
			attr->max_rd_atomic);
		goto connected;
	case RDMA_CM_EVENT_CONNECT_ERROR:
		connstate = -ENOTCONN;
		goto connected;
	case RDMA_CM_EVENT_UNREACHABLE:
		connstate = -ENETDOWN;
		goto connected;
	case RDMA_CM_EVENT_REJECTED:
		connstate = -ECONNREFUSED;
		goto connected;
	case RDMA_CM_EVENT_DISCONNECTED:
		connstate = -ECONNABORTED;
		goto connected;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		connstate = -ENODEV;
connected:
		dprintk("RPC: %s: %sconnected\n",
			__func__, connstate > 0 ? "" : "dis");
		ep->rep_connected = connstate;
		rpcrdma_conn_func(ep);
		wake_up_all(&ep->rep_connect_wait);
		/*FALLTHROUGH*/
	default:
		dprintk("RPC: %s: %pIS:%u (ep 0x%p): %s\n",
			__func__, sap, rpc_get_port(sap), ep,
			rdma_event_msg(event->event));
		break;
	}

#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
	if (connstate == 1) {
		int ird = attr->max_dest_rd_atomic;
		int tird = ep->rep_remote_cma.responder_resources;

		pr_info("rpcrdma: connection to %pIS:%u on %s, memreg '%s', %d credits, %d responders%s\n",
			sap, rpc_get_port(sap),
			ia->ri_device->name,
			ia->ri_ops->ro_displayname,
			xprt->rx_buf.rb_max_requests,
			ird, ird < 4 && ird < tird / 2 ? " (low!)" : "");
	} else if (connstate < 0) {
		pr_info("rpcrdma: connection to %pIS:%u closed (%d)\n",
			sap, rpc_get_port(sap), connstate);
	}
#endif

	return 0;
}

static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
			struct rpcrdma_ia *ia, struct sockaddr *addr)
{
	struct rdma_cm_id *id;
	int rc;

	init_completion(&ia->ri_done);

	id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(id)) {
		rc = PTR_ERR(id);
		dprintk("RPC: %s: rdma_create_id() failed %i\n",
			__func__, rc);
		return id;
	}

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_addr(id, NULL, addr, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_addr() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	ia->ri_async_rc = -ETIMEDOUT;
	rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
	if (rc) {
		dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
			__func__, rc);
		goto out;
	}
	wait_for_completion_interruptible_timeout(&ia->ri_done,
				msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
	rc = ia->ri_async_rc;
	if (rc)
		goto out;

	return id;

out:
	rdma_destroy_id(id);
	return ERR_PTR(rc);
}

/*
 * Drain any cq, prior to teardown.
 */
static void
rpcrdma_clean_cq(struct ib_cq *cq)
{
	struct ib_wc wc;
	int count = 0;

	while (1 == ib_poll_cq(cq, 1, &wc))
		++count;

	if (count)
		dprintk("RPC: %s: flushed %d events (last 0x%x)\n",
			__func__, count, wc.opcode);
}

/*
 * Exported functions.
 */

/*
 * Open and initialize an Interface Adapter.
 *  o initializes fields of struct rpcrdma_ia, including
 *    interface and provider attributes and protection zone.
 */
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
	int rc, mem_priv;
	struct rpcrdma_ia *ia = &xprt->rx_ia;
	struct ib_device_attr *devattr = &ia->ri_devattr;

	ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
	if (IS_ERR(ia->ri_id)) {
		rc = PTR_ERR(ia->ri_id);
		goto out1;
	}
	ia->ri_device = ia->ri_id->device;

	ia->ri_pd = ib_alloc_pd(ia->ri_device);
	if (IS_ERR(ia->ri_pd)) {
		rc = PTR_ERR(ia->ri_pd);
		dprintk("RPC: %s: ib_alloc_pd() failed %i\n",
			__func__, rc);
		goto out2;
	}

	rc = ib_query_device(ia->ri_device, devattr);
	if (rc) {
		dprintk("RPC: %s: ib_query_device failed %d\n",
			__func__, rc);
		goto out3;
	}

	if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
		ia->ri_have_dma_lkey = 1;
		ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
	}

	if (memreg == RPCRDMA_FRMR) {
		/* Requires both frmr reg and local dma lkey */
		if (((devattr->device_cap_flags &
		     (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) !=
		    (IB_DEVICE_MEM_MGT_EXTENSIONS|IB_DEVICE_LOCAL_DMA_LKEY)) ||
		      (devattr->max_fast_reg_page_list_len == 0)) {
			dprintk("RPC: %s: FRMR registration "
				"not supported by HCA\n", __func__);
			memreg = RPCRDMA_MTHCAFMR;
		}
	}
	if (memreg == RPCRDMA_MTHCAFMR) {
		if (!ia->ri_device->alloc_fmr) {
			dprintk("RPC: %s: MTHCAFMR registration "
				"not supported by HCA\n", __func__);
			memreg = RPCRDMA_ALLPHYSICAL;
		}
	}

	/*
	 * Optionally obtain an underlying physical identity mapping in
	 * order to do a memory window-based bind. This base registration
	 * is protected from remote access - that is enabled only by binding
	 * for the specific bytes targeted during each RPC operation, and
	 * revoked after the corresponding completion similar to a storage
	 * adapter.
	 */
	switch (memreg) {
	case RPCRDMA_FRMR:
		ia->ri_ops = &rpcrdma_frwr_memreg_ops;
		break;
	case RPCRDMA_ALLPHYSICAL:
		ia->ri_ops = &rpcrdma_physical_memreg_ops;
		mem_priv = IB_ACCESS_LOCAL_WRITE |
				IB_ACCESS_REMOTE_WRITE |
				IB_ACCESS_REMOTE_READ;
		goto register_setup;
	case RPCRDMA_MTHCAFMR:
		ia->ri_ops = &rpcrdma_fmr_memreg_ops;
		if (ia->ri_have_dma_lkey)
			break;
		mem_priv = IB_ACCESS_LOCAL_WRITE;
	register_setup:
		ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
		if (IS_ERR(ia->ri_bind_mem)) {
			printk(KERN_ALERT "%s: ib_get_dma_mr for "
				"phys register failed with %lX\n",
				__func__, PTR_ERR(ia->ri_bind_mem));
			rc = -ENOMEM;
			goto out3;
		}
		break;
	default:
		printk(KERN_ERR "RPC: Unsupported memory "
				"registration mode: %d\n", memreg);
		rc = -ENOMEM;
		goto out3;
	}
	dprintk("RPC: %s: memory registration strategy is '%s'\n",
		__func__, ia->ri_ops->ro_displayname);

	rwlock_init(&ia->ri_qplock);
	return 0;

out3:
	ib_dealloc_pd(ia->ri_pd);
	ia->ri_pd = NULL;
out2:
	rdma_destroy_id(ia->ri_id);
	ia->ri_id = NULL;
out1:
	return rc;
}
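/* Summary of the registration mode fallback above (illustrative
 * commentary): RPCRDMA_FRMR drops back to RPCRDMA_MTHCAFMR when the
 * device lacks fast-register support plus a local DMA lkey, and
 * RPCRDMA_MTHCAFMR drops back to RPCRDMA_ALLPHYSICAL when the device
 * has no alloc_fmr verb.  The final mode selects the memreg ops
 * vtable (ia->ri_ops) used by the rest of the transport, and decides
 * whether a DMA MR (ia->ri_bind_mem) must be obtained for lkey use.
 */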
/*
 * Clean up/close an IA.
 *   o if event handles and PD have been initialized, free them.
 *   o close the IA
 */
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering\n", __func__);
	if (ia->ri_bind_mem != NULL) {
		rc = ib_dereg_mr(ia->ri_bind_mem);
		dprintk("RPC: %s: ib_dereg_mr returned %i\n",
			__func__, rc);
	}

	if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
		if (ia->ri_id->qp)
			rdma_destroy_qp(ia->ri_id);
		rdma_destroy_id(ia->ri_id);
		ia->ri_id = NULL;
	}

	/* If the pd is still busy, xprtrdma missed freeing a resource */
	if (ia->ri_pd && !IS_ERR(ia->ri_pd))
		WARN_ON(ib_dealloc_pd(ia->ri_pd));
}

/*
 * Create unconnected endpoint.
 */
int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
				struct rpcrdma_create_data_internal *cdata)
{
	struct ib_device_attr *devattr = &ia->ri_devattr;
	struct ib_cq *sendcq, *recvcq;
	struct ib_cq_init_attr cq_attr = {};
	int rc, err;

	/* check provider's send/recv wr limits */
	if (cdata->max_requests > devattr->max_qp_wr)
		cdata->max_requests = devattr->max_qp_wr;

	ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
	ep->rep_attr.qp_context = ep;
	ep->rep_attr.srq = NULL;
	ep->rep_attr.cap.max_send_wr = cdata->max_requests;
	rc = ia->ri_ops->ro_open(ia, ep, cdata);
	if (rc)
		return rc;
	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
	ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
	ep->rep_attr.cap.max_recv_sge = 1;
	ep->rep_attr.cap.max_inline_data = 0;
	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	ep->rep_attr.qp_type = IB_QPT_RC;
	ep->rep_attr.port_num = ~0;

	if (cdata->padding) {
		ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
						      GFP_KERNEL);
		if (IS_ERR(ep->rep_padbuf))
			return PTR_ERR(ep->rep_padbuf);
	} else
		ep->rep_padbuf = NULL;

	dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
		"iovs: send %d recv %d\n",
		__func__,
		ep->rep_attr.cap.max_send_wr,
		ep->rep_attr.cap.max_recv_wr,
		ep->rep_attr.cap.max_send_sge,
		ep->rep_attr.cap.max_recv_sge);

	/* set trigger for requesting send completion */
	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
	if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
		ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
	else if (ep->rep_cqinit <= 2)
		ep->rep_cqinit = 0;
	INIT_CQCOUNT(ep);
	init_waitqueue_head(&ep->rep_connect_wait);
	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);

	cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1;
	sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
			      rpcrdma_cq_async_error_upcall, ep, &cq_attr);
	if (IS_ERR(sendcq)) {
		rc = PTR_ERR(sendcq);
		dprintk("RPC: %s: failed to create send CQ: %i\n",
			__func__, rc);
		goto out1;
	}

	rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		goto out2;
	}

	cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1;
	recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
			      rpcrdma_cq_async_error_upcall, ep, &cq_attr);
	if (IS_ERR(recvcq)) {
		rc = PTR_ERR(recvcq);
		dprintk("RPC: %s: failed to create recv CQ: %i\n",
			__func__, rc);
		goto out2;
	}

	rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
	if (rc) {
		dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
			__func__, rc);
		ib_destroy_cq(recvcq);
		goto out2;
	}

	ep->rep_attr.send_cq = sendcq;
	ep->rep_attr.recv_cq = recvcq;

	/* Initialize cma parameters */

	/* RPC/RDMA does not use private data */
	ep->rep_remote_cma.private_data = NULL;
	ep->rep_remote_cma.private_data_len = 0;

	/* Client offers RDMA Read but does not initiate */
	ep->rep_remote_cma.initiator_depth = 0;
	if (devattr->max_qp_rd_atom > 32)	/* arbitrary but <= 255 */
		ep->rep_remote_cma.responder_resources = 32;
	else
		ep->rep_remote_cma.responder_resources =
						devattr->max_qp_rd_atom;

	ep->rep_remote_cma.retry_count = 7;
	ep->rep_remote_cma.flow_control = 0;
	ep->rep_remote_cma.rnr_retry_count = 0;

	return 0;

out2:
	err = ib_destroy_cq(sendcq);
	if (err)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, err);
out1:
	rpcrdma_free_regbuf(ia, ep->rep_padbuf);
	return rc;
}

/*
 * rpcrdma_ep_destroy
 *
 * Disconnect and destroy endpoint. After this, the only
 * valid operations on the ep are to free it (if dynamically
 * allocated) or re-create it.
 */
void
rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	dprintk("RPC: %s: entering, connected is %d\n",
		__func__, ep->rep_connected);

	cancel_delayed_work_sync(&ep->rep_connect_worker);

	if (ia->ri_id->qp) {
		rpcrdma_ep_disconnect(ep, ia);
		rdma_destroy_qp(ia->ri_id);
		ia->ri_id->qp = NULL;
	}

	rpcrdma_free_regbuf(ia, ep->rep_padbuf);

	rpcrdma_clean_cq(ep->rep_attr.recv_cq);
	rc = ib_destroy_cq(ep->rep_attr.recv_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);

	rpcrdma_clean_cq(ep->rep_attr.send_cq);
	rc = ib_destroy_cq(ep->rep_attr.send_cq);
	if (rc)
		dprintk("RPC: %s: ib_destroy_cq returned %i\n",
			__func__, rc);
}

/*
 * Connect unconnected endpoint.
 */
int
rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	struct rdma_cm_id *id, *old;
	int rc = 0;
	int retry_count = 0;

	if (ep->rep_connected != 0) {
		struct rpcrdma_xprt *xprt;
retry:
		dprintk("RPC: %s: reconnecting...\n", __func__);

		rpcrdma_ep_disconnect(ep, ia);
		rpcrdma_flush_cqs(ep);

		xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
		id = rpcrdma_create_id(xprt, ia,
				(struct sockaddr *)&xprt->rx_data.addr);
		if (IS_ERR(id)) {
			rc = -EHOSTUNREACH;
			goto out;
		}
		/* TEMP TEMP TEMP - fail if new device:
		 * Deregister/remarshal *all* requests!
		 * Close and recreate adapter, pd, etc!
		 * Re-determine all attributes still sane!
		 * More stuff I haven't thought of!
		 * Rrrgh!
		 */
		if (ia->ri_device != id->device) {
			printk("RPC: %s: can't reconnect on "
				"different device!\n", __func__);
			rdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}
		/* END TEMP */
		rc = rdma_create_qp(id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			rdma_destroy_id(id);
			rc = -ENETUNREACH;
			goto out;
		}

		write_lock(&ia->ri_qplock);
		old = ia->ri_id;
		ia->ri_id = id;
		write_unlock(&ia->ri_qplock);

		rdma_destroy_qp(old);
		rdma_destroy_id(old);
	} else {
		dprintk("RPC: %s: connecting...\n", __func__);
		rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
		if (rc) {
			dprintk("RPC: %s: rdma_create_qp failed %i\n",
				__func__, rc);
			/* do not update ep->rep_connected */
			return -ENETUNREACH;
		}
	}

	ep->rep_connected = 0;

	rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
	if (rc) {
		dprintk("RPC: %s: rdma_connect() failed with %i\n",
				__func__, rc);
		goto out;
	}

	wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);

	/*
	 * Check state. A non-peer reject indicates no listener
	 * (ECONNREFUSED), which may be a transient state. All
	 * others indicate a transport condition which has already
	 * undergone a best-effort.
	 */
	if (ep->rep_connected == -ECONNREFUSED &&
	    ++retry_count <= RDMA_CONNECT_RETRY_MAX) {
		dprintk("RPC: %s: non-peer_reject, retry\n", __func__);
		goto retry;
	}
	if (ep->rep_connected <= 0) {
		/* Sometimes, the only way to reliably connect to remote
		 * CMs is to use same nonzero values for ORD and IRD. */
		if (retry_count++ <= RDMA_CONNECT_RETRY_MAX + 1 &&
		    (ep->rep_remote_cma.responder_resources == 0 ||
		     ep->rep_remote_cma.initiator_depth !=
				ep->rep_remote_cma.responder_resources)) {
			if (ep->rep_remote_cma.responder_resources == 0)
				ep->rep_remote_cma.responder_resources = 1;
			ep->rep_remote_cma.initiator_depth =
				ep->rep_remote_cma.responder_resources;
			goto retry;
		}
		rc = ep->rep_connected;
	} else {
		dprintk("RPC: %s: connected\n", __func__);
	}

out:
	if (rc)
		ep->rep_connected = rc;
	return rc;
}

/*
 * rpcrdma_ep_disconnect
 *
 * This is separate from destroy to facilitate the ability
 * to reconnect without recreating the endpoint.
 *
 * This call is not reentrant, and must not be made in parallel
 * on the same endpoint.
 */
void
rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
{
	int rc;

	rpcrdma_flush_cqs(ep);
	rc = rdma_disconnect(ia->ri_id);
	if (!rc) {
		/* returns without wait if not connected */
		wait_event_interruptible(ep->rep_connect_wait,
							ep->rep_connected != 1);
		dprintk("RPC: %s: after wait, %sconnected\n", __func__,
			(ep->rep_connected == 1) ? "still " : "dis");
	} else {
		dprintk("RPC: %s: rdma_disconnect %i\n", __func__, rc);
		ep->rep_connected = rc;
	}
}

static struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_req *req;

	req = kzalloc(sizeof(*req), GFP_KERNEL);
	if (req == NULL)
		return ERR_PTR(-ENOMEM);

	req->rl_buffer = &r_xprt->rx_buf;
	return req;
}

static struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_rep *rep;
	int rc;

	rc = -ENOMEM;
	rep = kzalloc(sizeof(*rep), GFP_KERNEL);
	if (rep == NULL)
		goto out;

	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
					       GFP_KERNEL);
	if (IS_ERR(rep->rr_rdmabuf)) {
		rc = PTR_ERR(rep->rr_rdmabuf);
		goto out_free;
	}

	rep->rr_device = ia->ri_device;
	rep->rr_rxprt = r_xprt;
	return rep;

out_free:
	kfree(rep);
out:
	return ERR_PTR(rc);
}

int
rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	char *p;
	size_t len;
	int i, rc;

	buf->rb_max_requests = cdata->max_requests;
	spin_lock_init(&buf->rb_lock);

	/* Need to allocate:
	 *   1.  arrays for send and recv pointers
	 *   2.  arrays of struct rpcrdma_req to fill in pointers
	 *   3.  array of struct rpcrdma_rep for replies
	 * Send/recv buffers in req/rep need to be registered
	 */
	len = buf->rb_max_requests *
		(sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));

	p = kzalloc(len, GFP_KERNEL);
	if (p == NULL) {
		dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
			__func__, len);
		rc = -ENOMEM;
		goto out;
	}
	buf->rb_pool = p;	/* for freeing it later */

	buf->rb_send_bufs = (struct rpcrdma_req **) p;
	p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
	buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
	p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];

	rc = ia->ri_ops->ro_init(r_xprt);
	if (rc)
		goto out;

	for (i = 0; i < buf->rb_max_requests; i++) {
		struct rpcrdma_req *req;
		struct rpcrdma_rep *rep;

		req = rpcrdma_create_req(r_xprt);
		if (IS_ERR(req)) {
			dprintk("RPC: %s: request buffer %d alloc"
				" failed\n", __func__, i);
			rc = PTR_ERR(req);
			goto out;
		}
		buf->rb_send_bufs[i] = req;

		rep = rpcrdma_create_rep(r_xprt);
		if (IS_ERR(rep)) {
			dprintk("RPC: %s: reply buffer %d alloc failed\n",
				__func__, i);
			rc = PTR_ERR(rep);
			goto out;
		}
		buf->rb_recv_bufs[i] = rep;
	}

	return 0;
out:
	rpcrdma_buffer_destroy(buf);
	return rc;
}

static void
rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
{
	if (!rep)
		return;

	rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
	kfree(rep);
}

static void
rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
	if (!req)
		return;

	rpcrdma_free_regbuf(ia, req->rl_sendbuf);
	rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
	kfree(req);
}

void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
	int i;

	/* clean up in reverse order from create
	 *   1.  recv mr memory (mr free, then kfree)
	 *   2.  send mr memory (mr free, then kfree)
	 *   3.  MWs
	 */
	dprintk("RPC: %s: entering\n", __func__);

	for (i = 0; i < buf->rb_max_requests; i++) {
		if (buf->rb_recv_bufs)
			rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
		if (buf->rb_send_bufs)
			rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
	}

	ia->ri_ops->ro_destroy(buf);

	kfree(buf->rb_pool);
}

struct rpcrdma_mw *
rpcrdma_get_mw(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
	struct rpcrdma_mw *mw = NULL;

	spin_lock(&buf->rb_mwlock);
	if (!list_empty(&buf->rb_mws)) {
		mw = list_first_entry(&buf->rb_mws,
				      struct rpcrdma_mw, mw_list);
		list_del_init(&mw->mw_list);
	}
	spin_unlock(&buf->rb_mwlock);

	if (!mw)
		pr_err("RPC: %s: no MWs available\n", __func__);
	return mw;
}

void
rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
{
	struct rpcrdma_buffer *buf = &r_xprt->rx_buf;

	spin_lock(&buf->rb_mwlock);
	list_add_tail(&mw->mw_list, &buf->rb_mws);
	spin_unlock(&buf->rb_mwlock);
}

static void
rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
{
	buf->rb_send_bufs[--buf->rb_send_index] = req;
	req->rl_niovs = 0;
	if (req->rl_reply) {
		buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
		req->rl_reply = NULL;
	}
}

/*
 * Get a set of request/reply buffers.
 *
 * Reply buffer (if needed) is attached to send buffer upon return.
 * Rule:
 *    rb_send_index and rb_recv_index MUST always be pointing to the
 *    *next* available buffer (non-NULL). They are incremented after
 *    removing buffers, and decremented *before* returning them.
 */
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
	struct rpcrdma_req *req;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);

	if (buffers->rb_send_index == buffers->rb_max_requests) {
		spin_unlock_irqrestore(&buffers->rb_lock, flags);
		dprintk("RPC: %s: out of request buffers\n", __func__);
		return ((struct rpcrdma_req *)NULL);
	}

	req = buffers->rb_send_bufs[buffers->rb_send_index];
	if (buffers->rb_send_index < buffers->rb_recv_index) {
		dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
			__func__,
			buffers->rb_recv_index - buffers->rb_send_index);
		req->rl_reply = NULL;
	} else {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;

	spin_unlock_irqrestore(&buffers->rb_lock, flags);
	return req;
}

/*
 * Put request/reply buffers back into pool.
 * Pre-decrement counter/array index.
 */
void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	rpcrdma_buffer_put_sendbuf(req, buffers);
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}
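/* Hypothetical usage sketch for the helpers above (not a verbatim
 * caller): a sender pairs rpcrdma_buffer_get() with a later
 * rpcrdma_buffer_put() once the request is retired:
 *
 *	struct rpcrdma_req *req;
 *
 *	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
 *	if (req == NULL)
 *		return -ENOMEM;		(hypothetical error handling)
 *	... marshal the RPC and post it with rpcrdma_ep_post() ...
 *	rpcrdma_buffer_put(req);
 *
 * rpcrdma_buffer_get() attaches a reply buffer to req->rl_reply when
 * one is available; rpcrdma_buffer_put() returns both to the pool.
 */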
/*
 * Recover reply buffers from pool.
 * This happens when recovering from error conditions.
 * Post-increment counter/array index.
 */
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
	struct rpcrdma_buffer *buffers = req->rl_buffer;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	if (buffers->rb_recv_index < buffers->rb_max_requests) {
		req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
		buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
	}
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Put reply buffers back into pool when not attached to
 * request. This happens in error conditions.
 */
void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
	struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
	unsigned long flags;

	spin_lock_irqsave(&buffers->rb_lock, flags);
	buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
	spin_unlock_irqrestore(&buffers->rb_lock, flags);
}

/*
 * Wrappers for internal-use kmalloc memory registration, used by buffer code.
 */

void
rpcrdma_mapping_error(struct rpcrdma_mr_seg *seg)
{
	dprintk("RPC: map_one: offset %p iova %llx len %zu\n",
		seg->mr_offset,
		(unsigned long long)seg->mr_dma, seg->mr_dmalen);
}

static int
rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
				struct ib_mr **mrp, struct ib_sge *iov)
{
	struct ib_phys_buf ipb;
	struct ib_mr *mr;
	int rc;

	/*
	 * All memory passed here was kmalloc'ed, therefore phys-contiguous.
	 */
	iov->addr = ib_dma_map_single(ia->ri_device,
			va, len, DMA_BIDIRECTIONAL);
	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
		return -ENOMEM;

	iov->length = len;

	if (ia->ri_have_dma_lkey) {
		*mrp = NULL;
		iov->lkey = ia->ri_dma_lkey;
		return 0;
	} else if (ia->ri_bind_mem != NULL) {
		*mrp = NULL;
		iov->lkey = ia->ri_bind_mem->lkey;
		return 0;
	}

	ipb.addr = iov->addr;
	ipb.size = iov->length;
	mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
			IB_ACCESS_LOCAL_WRITE, &iov->addr);

	dprintk("RPC: %s: phys convert: 0x%llx "
			"registered 0x%llx length %d\n",
			__func__, (unsigned long long)ipb.addr,
			(unsigned long long)iov->addr, len);

	if (IS_ERR(mr)) {
		*mrp = NULL;
		rc = PTR_ERR(mr);
		dprintk("RPC: %s: failed with %i\n", __func__, rc);
	} else {
		*mrp = mr;
		iov->lkey = mr->lkey;
		rc = 0;
	}

	return rc;
}

static int
rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
				struct ib_mr *mr, struct ib_sge *iov)
{
	int rc;

	ib_dma_unmap_single(ia->ri_device,
			iov->addr, iov->length, DMA_BIDIRECTIONAL);

	if (NULL == mr)
		return 0;

	rc = ib_dereg_mr(mr);
	if (rc)
		dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
	return rc;
}
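/* Hypothetical usage sketch for the regbuf helpers below (patterned on
 * rpcrdma_create_rep() above):
 *
 *	struct rpcrdma_regbuf *rb;
 *
 *	rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
 *	if (IS_ERR(rb))
 *		return PTR_ERR(rb);
 *	...
 *	rpcrdma_free_regbuf(ia, rb);
 *
 * A regbuf is registered for LOCAL access only, so it can back SEND
 * and RECV work requests but never an RDMA READ or WRITE target.
 */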
/**
 * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
 * @ia: controlling rpcrdma_ia
 * @size: size of buffer to be allocated, in bytes
 * @flags: GFP flags
 *
 * Returns pointer to private header of an area of internally
 * registered memory, or an ERR_PTR. The registered buffer follows
 * the end of the private header.
 *
 * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
 * receiving the payload of RDMA RECV operations. regbufs are not
 * used for RDMA READ/WRITE operations, thus are registered only for
 * LOCAL access.
 */
struct rpcrdma_regbuf *
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
	struct rpcrdma_regbuf *rb;
	int rc;

	rc = -ENOMEM;
	rb = kmalloc(sizeof(*rb) + size, flags);
	if (rb == NULL)
		goto out;

	rb->rg_size = size;
	rb->rg_owner = NULL;
	rc = rpcrdma_register_internal(ia, rb->rg_base, size,
				       &rb->rg_mr, &rb->rg_iov);
	if (rc)
		goto out_free;

	return rb;

out_free:
	kfree(rb);
out:
	return ERR_PTR(rc);
}

/**
 * rpcrdma_free_regbuf - deregister and free registered buffer
 * @ia: controlling rpcrdma_ia
 * @rb: regbuf to be deregistered and freed
 */
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
	if (rb) {
		rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
		kfree(rb);
	}
}

/*
 * Prepost any receive buffer, then post send.
 *
 * Receive buffer is donated to hardware, reclaimed upon recv completion.
 */
int
rpcrdma_ep_post(struct rpcrdma_ia *ia,
		struct rpcrdma_ep *ep,
		struct rpcrdma_req *req)
{
	struct ib_send_wr send_wr, *send_wr_fail;
	struct rpcrdma_rep *rep = req->rl_reply;
	int rc;

	if (rep) {
		rc = rpcrdma_ep_post_recv(ia, ep, rep);
		if (rc)
			goto out;
		req->rl_reply = NULL;
	}

	send_wr.next = NULL;
	send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
	send_wr.sg_list = req->rl_send_iov;
	send_wr.num_sge = req->rl_niovs;
	send_wr.opcode = IB_WR_SEND;
	if (send_wr.num_sge == 4)	/* no need to sync any pad (constant) */
		ib_dma_sync_single_for_device(ia->ri_device,
					      req->rl_send_iov[3].addr,
					      req->rl_send_iov[3].length,
					      DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_device,
				      req->rl_send_iov[1].addr,
				      req->rl_send_iov[1].length,
				      DMA_TO_DEVICE);
	ib_dma_sync_single_for_device(ia->ri_device,
				      req->rl_send_iov[0].addr,
				      req->rl_send_iov[0].length,
				      DMA_TO_DEVICE);

	if (DECR_CQCOUNT(ep) > 0)
		send_wr.send_flags = 0;
	else { /* Provider must take a send completion every now and then */
		INIT_CQCOUNT(ep);
		send_wr.send_flags = IB_SEND_SIGNALED;
	}

	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
	if (rc)
		dprintk("RPC: %s: ib_post_send returned %i\n", __func__,
			rc);
out:
	return rc;
}

/*
 * (Re)post a receive buffer.
 */
int
rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
		     struct rpcrdma_ep *ep,
		     struct rpcrdma_rep *rep)
{
	struct ib_recv_wr recv_wr, *recv_wr_fail;
	int rc;

	recv_wr.next = NULL;
	recv_wr.wr_id = (u64) (unsigned long) rep;
	recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
	recv_wr.num_sge = 1;

	ib_dma_sync_single_for_cpu(ia->ri_device,
				   rdmab_addr(rep->rr_rdmabuf),
				   rdmab_length(rep->rr_rdmabuf),
				   DMA_BIDIRECTIONAL);

	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);

	if (rc)
		dprintk("RPC: %s: ib_post_recv returned %i\n", __func__,
			rc);
	return rc;
}
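/* Note on send-completion signaling (illustrative commentary):
 * rpcrdma_ep_post() leaves most SENDs unsignaled.  rpcrdma_ep_create()
 * seeds ep->rep_cqinit to roughly half the send queue depth (capped at
 * RPCRDMA_MAX_UNSIGNALED_SENDS), and DECR_CQCOUNT() counts each post
 * against that budget.  When the budget is exhausted, the SEND is
 * flagged IB_SEND_SIGNALED and the counter is re-armed with
 * INIT_CQCOUNT(), so the provider's send CQ cannot wrap while a
 * completion interrupt per RPC is still avoided.
 */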
/* How many chunk list items fit within our inline buffers?
 */
unsigned int
rpcrdma_max_segments(struct rpcrdma_xprt *r_xprt)
{
	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
	int bytes, segments;

	bytes = min_t(unsigned int, cdata->inline_wsize, cdata->inline_rsize);
	bytes -= RPCRDMA_HDRLEN_MIN;
	if (bytes < sizeof(struct rpcrdma_segment) * 2) {
		pr_warn("RPC: %s: inline threshold too small\n",
			__func__);
		return 0;
	}

	segments = 1 << (fls(bytes / sizeof(struct rpcrdma_segment)) - 1);
	dprintk("RPC: %s: max chunk list size = %d segments\n",
		__func__, segments);
	return segments;
}
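/* Worked example for rpcrdma_max_segments() (assumed values, for
 * illustration only): with a 1024-byte inline threshold, a 28-byte
 * minimum RPC-over-RDMA header, and a 16-byte struct rpcrdma_segment:
 *
 *	bytes    = 1024 - 28 = 996
 *	996 / 16 = 62
 *	1 << (fls(62) - 1) = 1 << 5 = 32 segments
 *
 * The fls() expression rounds the segment count down to a power of
 * two, and the "* 2" check rejects thresholds too small to carry even
 * a two-segment chunk list.
 */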