// SPDX-License-Identifier: GPL-2.0-only
/*
 * RDMA transport layer based on the trans_fd.c implementation.
 *
 * Copyright (C) 2008 by Tom Tucker <tom@opengridcomputing.com>
 * Copyright (C) 2006 by Russ Cox <rsc@swtch.com>
 * Copyright (C) 2004-2005 by Latchesar Ionkov <lucho@ionkov.net>
 * Copyright (C) 2004-2008 by Eric Van Hensbergen <ericvh@gmail.com>
 * Copyright (C) 1997-2002 by Ron Minnich <rminnich@sarnoff.com>
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/in.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/ipv6.h>
#include <linux/kthread.h>
#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/un.h>
#include <linux/uaccess.h>
#include <linux/inet.h>
#include <linux/file.h>
#include <linux/fs_context.h>
#include <linux/semaphore.h>
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <net/9p/9p.h>
#include <net/9p/client.h>
#include <net/9p/transport.h>
#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>

#define P9_RDMA_SEND_SGE	4
#define P9_RDMA_RECV_SGE	4
#define P9_RDMA_IRD		0
#define P9_RDMA_ORD		0
#define P9_RDMA_MAXSIZE		(1024*1024)	/* 1MB */
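/*
 * This transport only ever posts SEND and RECV work requests (there are
 * no RDMA READ/WRITE operations), which is why connections are requested
 * below with zero responder resources and initiator depth
 * (P9_RDMA_IRD/P9_RDMA_ORD).  P9_RDMA_MAXSIZE bounds the negotiated 9P
 * msize and therefore the size of every DMA-mapped send/receive buffer.
 */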
/**
 * struct p9_trans_rdma - RDMA transport instance
 *
 * @state: tracks the transport state machine for connection setup and tear down
 * @cm_id: The RDMA CM ID
 * @pd: Protection Domain pointer
 * @qp: Queue Pair pointer
 * @cq: Completion Queue pointer
 * @timeout: Number of msecs to wait for connection management events
 * @privport: Whether a privileged port may be used
 * @port: The port to use
 * @sq_depth: The depth of the Send Queue
 * @sq_sem: Semaphore for the SQ
 * @rq_depth: The depth of the Receive Queue
 * @rq_sem: Semaphore for the RQ
 * @excess_rc: Amount of posted Receive Contexts without a pending request.
 *	       See rdma_request()
 * @addr: The remote peer's address
 * @req_lock: Protects the active request list
 * @cm_done: Completion event for connection management tracking
 */
struct p9_trans_rdma {
	enum {
		P9_RDMA_INIT,
		P9_RDMA_ADDR_RESOLVED,
		P9_RDMA_ROUTE_RESOLVED,
		P9_RDMA_CONNECTED,
		P9_RDMA_FLUSHING,
		P9_RDMA_CLOSING,
		P9_RDMA_CLOSED,
	} state;
	struct rdma_cm_id *cm_id;
	struct ib_pd *pd;
	struct ib_qp *qp;
	struct ib_cq *cq;
	long timeout;
	bool privport;
	u16 port;
	int sq_depth;
	struct semaphore sq_sem;
	int rq_depth;
	struct semaphore rq_sem;
	atomic_t excess_rc;
	struct sockaddr_in addr;
	spinlock_t req_lock;

	struct completion cm_done;
};

struct p9_rdma_req;

/**
 * struct p9_rdma_context - Keeps track of in-process WR
 *
 * @cqe: completion queue entry
 * @busa: Bus address to unmap when the WR completes
 * @req: Keeps track of requests (send)
 * @rc: Keeps track of replies (receive)
 */
struct p9_rdma_context {
	struct ib_cqe cqe;
	dma_addr_t busa;
	union {
		struct p9_req_t *req;
		struct p9_fcall rc;
	};
};

static int p9_rdma_show_options(struct seq_file *m, struct p9_client *clnt)
{
	struct p9_trans_rdma *rdma = clnt->trans;

	if (rdma->port != P9_RDMA_PORT)
		seq_printf(m, ",port=%u", rdma->port);
	if (rdma->sq_depth != P9_RDMA_SQ_DEPTH)
		seq_printf(m, ",sq=%u", rdma->sq_depth);
	if (rdma->rq_depth != P9_RDMA_RQ_DEPTH)
		seq_printf(m, ",rq=%u", rdma->rq_depth);
	if (rdma->timeout != P9_RDMA_TIMEOUT)
		seq_printf(m, ",timeout=%lu", rdma->timeout);
	if (rdma->privport)
		seq_puts(m, ",privport");
	return 0;
}
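/*
 * For illustration only (the exact mount syntax comes from the v9fs
 * front end, not from this file): a client would typically select this
 * transport with something like
 *
 *	mount -t 9p -o trans=rdma,port=5640,sq=32,rq=32 <server-ip> /mnt
 *
 * Options left at their defaults are omitted from the mount options
 * emitted by p9_rdma_show_options() above.
 */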
static int
p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
	struct p9_client *c = id->context;
	struct p9_trans_rdma *rdma = c->trans;

	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		BUG_ON(rdma->state != P9_RDMA_INIT);
		rdma->state = P9_RDMA_ADDR_RESOLVED;
		break;

	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		BUG_ON(rdma->state != P9_RDMA_ADDR_RESOLVED);
		rdma->state = P9_RDMA_ROUTE_RESOLVED;
		break;

	case RDMA_CM_EVENT_ESTABLISHED:
		BUG_ON(rdma->state != P9_RDMA_ROUTE_RESOLVED);
		rdma->state = P9_RDMA_CONNECTED;
		break;

	case RDMA_CM_EVENT_DISCONNECTED:
		if (rdma)
			rdma->state = P9_RDMA_CLOSED;
		c->status = Disconnected;
		break;

	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		break;

	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_ROUTE_ERROR:
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
	case RDMA_CM_EVENT_MULTICAST_JOIN:
	case RDMA_CM_EVENT_MULTICAST_ERROR:
	case RDMA_CM_EVENT_REJECTED:
	case RDMA_CM_EVENT_CONNECT_REQUEST:
	case RDMA_CM_EVENT_CONNECT_RESPONSE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_ADDR_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
		c->status = Disconnected;
		rdma_disconnect(rdma->cm_id);
		break;
	default:
		BUG();
	}
	complete(&rdma->cm_done);
	return 0;
}

static void
recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct p9_client *client = cq->cq_context;
	struct p9_trans_rdma *rdma = client->trans;
	struct p9_rdma_context *c =
		container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
	struct p9_req_t *req;
	int err = 0;
	int16_t tag;

	req = NULL;
	ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize,
			    DMA_FROM_DEVICE);

	if (wc->status != IB_WC_SUCCESS)
		goto err_out;

	c->rc.size = wc->byte_len;
	err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1);
	if (err)
		goto err_out;

	req = p9_tag_lookup(client, tag);
	if (!req)
		goto err_out;

	/* Check that we have not yet received a reply for this request.
	 */
	if (unlikely(req->rc.sdata)) {
		pr_err("Duplicate reply for request %d", tag);
		goto err_out;
	}

	req->rc.size = c->rc.size;
	req->rc.sdata = c->rc.sdata;
	p9_client_cb(client, req, REQ_STATUS_RCVD);

 out:
	up(&rdma->rq_sem);
	kfree(c);
	return;

 err_out:
	p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n",
		 req, err, wc->status);
	rdma->state = P9_RDMA_FLUSHING;
	client->status = Disconnected;
	goto out;
}

static void
send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct p9_client *client = cq->cq_context;
	struct p9_trans_rdma *rdma = client->trans;
	struct p9_rdma_context *c =
		container_of(wc->wr_cqe, struct p9_rdma_context, cqe);

	ib_dma_unmap_single(rdma->cm_id->device,
			    c->busa, c->req->tc.size,
			    DMA_TO_DEVICE);
	up(&rdma->sq_sem);
	p9_req_put(client, c->req);
	kfree(c);
}

static void qp_event_handler(struct ib_event *event, void *context)
{
	p9_debug(P9_DEBUG_ERROR, "QP event %d context %p\n",
		 event->event, context);
}

static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
{
	if (!rdma)
		return;

	if (rdma->qp && !IS_ERR(rdma->qp))
		ib_destroy_qp(rdma->qp);

	if (rdma->pd && !IS_ERR(rdma->pd))
		ib_dealloc_pd(rdma->pd);

	if (rdma->cq && !IS_ERR(rdma->cq))
		ib_free_cq(rdma->cq);

	if (rdma->cm_id && !IS_ERR(rdma->cm_id))
		rdma_destroy_id(rdma->cm_id);

	kfree(rdma);
}

static int
post_recv(struct p9_client *client, struct p9_rdma_context *c)
{
	struct p9_trans_rdma *rdma = client->trans;
	struct ib_recv_wr wr;
	struct ib_sge sge;
	int ret;

	c->busa = ib_dma_map_single(rdma->cm_id->device,
				    c->rc.sdata, client->msize,
				    DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
		goto error;

	c->cqe.done = recv_done;

	sge.addr = c->busa;
	sge.length = client->msize;
	sge.lkey = rdma->pd->local_dma_lkey;

	wr.next = NULL;
	wr.wr_cqe = &c->cqe;
	wr.sg_list = &sge;
	wr.num_sge = 1;

	ret = ib_post_recv(rdma->qp, &wr, NULL);
	if (ret)
		ib_dma_unmap_single(rdma->cm_id->device, c->busa,
				    client->msize, DMA_FROM_DEVICE);
	return ret;

 error:
	p9_debug(P9_DEBUG_ERROR, "EIO\n");
	return -EIO;
}
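/*
 * Flow control: each request consumes one receive credit (rq_sem) when
 * its reply buffer is posted and one send credit (sq_sem) when the send
 * WR is posted, so the RQ and SQ can never be overrun.  Credits are
 * returned in recv_done() and send_done() respectively.
 */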
static int rdma_request(struct p9_client *client, struct p9_req_t *req)
{
	struct p9_trans_rdma *rdma = client->trans;
	struct ib_send_wr wr;
	struct ib_sge sge;
	int err = 0;
	unsigned long flags;
	struct p9_rdma_context *c = NULL;
	struct p9_rdma_context *rpl_context = NULL;

	/* When an error occurs between posting the recv and the send,
	 * there will be a receive context posted without a pending request.
	 * Since there is no way to "un-post" it, we remember it and skip
	 * post_recv() for the next request.
	 * So here, see if we are this `next request' and need to absorb an
	 * excess rc.  If yes, then drop and free our own, and do not
	 * post_recv().
	 */
	if (unlikely(atomic_read(&rdma->excess_rc) > 0)) {
		if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) {
			/* Got one! */
			p9_fcall_fini(&req->rc);
			req->rc.sdata = NULL;
			goto dont_need_post_recv;
		} else {
			/* We raced and lost. */
			atomic_inc(&rdma->excess_rc);
		}
	}

	/* Allocate an fcall for the reply */
	rpl_context = kmalloc(sizeof *rpl_context, GFP_NOFS);
	if (!rpl_context) {
		err = -ENOMEM;
		goto recv_error;
	}
	rpl_context->rc.sdata = req->rc.sdata;

	/*
	 * Post a receive buffer for this request. We need to ensure
	 * there is a reply buffer available for every outstanding
	 * request. A flushed request can result in no reply for an
	 * outstanding request, so we must keep a count to avoid
	 * overflowing the RQ.
	 */
	if (down_interruptible(&rdma->rq_sem)) {
		err = -EINTR;
		goto recv_error;
	}

	err = post_recv(client, rpl_context);
	if (err) {
		p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err);
		goto recv_error;
	}
	/* remove posted receive buffer from request structure */
	req->rc.sdata = NULL;

 dont_need_post_recv:
	/* Post the request */
	c = kmalloc(sizeof *c, GFP_NOFS);
	if (!c) {
		err = -ENOMEM;
		goto send_error;
	}
	c->req = req;

	c->busa = ib_dma_map_single(rdma->cm_id->device,
				    c->req->tc.sdata, c->req->tc.size,
				    DMA_TO_DEVICE);
	if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) {
		err = -EIO;
		goto send_error;
	}

	c->cqe.done = send_done;

	sge.addr = c->busa;
	sge.length = c->req->tc.size;
	sge.lkey = rdma->pd->local_dma_lkey;

	wr.next = NULL;
	wr.wr_cqe = &c->cqe;
	wr.opcode = IB_WR_SEND;
	wr.send_flags = IB_SEND_SIGNALED;
	wr.sg_list = &sge;
	wr.num_sge = 1;

	if (down_interruptible(&rdma->sq_sem)) {
		err = -EINTR;
		goto dma_unmap;
	}

	/* Mark request as `sent' *before* we actually send it,
	 * because doing it after could erase the REQ_STATUS_RCVD
	 * status in case of a very fast reply.
	 */
	WRITE_ONCE(req->status, REQ_STATUS_SENT);
	err = ib_post_send(rdma->qp, &wr, NULL);
	if (err)
		goto dma_unmap;

	/* Success */
	return 0;

 dma_unmap:
	ib_dma_unmap_single(rdma->cm_id->device, c->busa,
			    c->req->tc.size, DMA_TO_DEVICE);
	/* Handle errors that happened during or while preparing the send: */
 send_error:
	WRITE_ONCE(req->status, REQ_STATUS_ERROR);
	kfree(c);
	p9_debug(P9_DEBUG_ERROR, "Error %d in rdma_request()\n", err);

	/* We did post_recv(), but not the send; we now have one receive
	 * buffer posted in excess.
	 */
	atomic_inc(&rdma->excess_rc);
	return err;

	/* Handle errors that happened during or while preparing post_recv(): */
 recv_error:
	kfree(rpl_context);
	spin_lock_irqsave(&rdma->req_lock, flags);
	if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) {
		rdma->state = P9_RDMA_CLOSING;
		spin_unlock_irqrestore(&rdma->req_lock, flags);
		rdma_disconnect(rdma->cm_id);
	} else
		spin_unlock_irqrestore(&rdma->req_lock, flags);
	return err;
}

static void rdma_close(struct p9_client *client)
{
	struct p9_trans_rdma *rdma;

	if (!client)
		return;

	rdma = client->trans;
	if (!rdma)
		return;

	client->status = Disconnected;
	rdma_disconnect(rdma->cm_id);
	rdma_destroy_trans(rdma);
}

/**
 * alloc_rdma - Allocate and initialize the rdma transport structure
 * @opts: Mount options structure
 */
static struct p9_trans_rdma *alloc_rdma(struct p9_rdma_opts *opts)
{
	struct p9_trans_rdma *rdma;

	rdma = kzalloc(sizeof(struct p9_trans_rdma), GFP_KERNEL);
	if (!rdma)
		return NULL;

	rdma->port = opts->port;
	rdma->privport = opts->privport;
	rdma->sq_depth = opts->sq_depth;
	rdma->rq_depth = opts->rq_depth;
	rdma->timeout = opts->timeout;
	spin_lock_init(&rdma->req_lock);
	init_completion(&rdma->cm_done);
	sema_init(&rdma->sq_sem, rdma->sq_depth);
	sema_init(&rdma->rq_sem, rdma->rq_depth);
	atomic_set(&rdma->excess_rc, 0);

	return rdma;
}
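/*
 * Returning 1 from ->cancel tells the 9P client that the request cannot
 * be revoked locally and that a TFLUSH must be sent; ->cancelled below
 * is then invoked once the flush completes without a reply.
 */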
static int rdma_cancel(struct p9_client *client, struct p9_req_t *req)
{
	/* Nothing to do here.
	 * We will take care of it (if we have to) in rdma_cancelled()
	 */
	return 1;
}

/* A request has been fully flushed without a reply.
 * That means we have posted one buffer in excess.
 */
static int rdma_cancelled(struct p9_client *client, struct p9_req_t *req)
{
	struct p9_trans_rdma *rdma = client->trans;

	atomic_inc(&rdma->excess_rc);
	return 0;
}

static int p9_rdma_bind_privport(struct p9_trans_rdma *rdma)
{
	struct sockaddr_in cl = {
		.sin_family		= AF_INET,
		.sin_addr.s_addr	= htonl(INADDR_ANY),
	};
	int port, err = -EINVAL;

	for (port = P9_DEF_MAX_RESVPORT; port >= P9_DEF_MIN_RESVPORT; port--) {
		cl.sin_port = htons((ushort)port);
		err = rdma_bind_addr(rdma->cm_id, (struct sockaddr *)&cl);
		if (err != -EADDRINUSE)
			break;
	}
	return err;
}
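/*
 * Connection establishment follows the usual RDMA CM sequence: resolve
 * the address, resolve the route, allocate CQ/PD/QP, then rdma_connect().
 * Each asynchronous step is driven to completion by p9_cm_event_handler()
 * signalling rdma->cm_done.
 */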
/**
 * rdma_create_trans - Transport method for creating a transport instance
 * @client: client instance
 * @fc: The filesystem context
 */
static int
rdma_create_trans(struct p9_client *client, struct fs_context *fc)
{
	const char *addr = fc->source;
	struct v9fs_context *ctx = fc->fs_private;
	/* mount options were already parsed into the fs context */
	struct p9_rdma_opts opts = ctx->rdma_opts;
	int err;
	struct p9_trans_rdma *rdma;
	struct rdma_conn_param conn_param;
	struct ib_qp_init_attr qp_attr;

	if (addr == NULL)
		return -EINVAL;

	/* Create and initialize the RDMA transport structure */
	rdma = alloc_rdma(&opts);
	if (!rdma)
		return -ENOMEM;

	/* Create the RDMA CM ID */
	rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client,
				     RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(rdma->cm_id))
		goto error;

	/* Associate the client with the transport */
	client->trans = rdma;

	/* Bind to a privileged port if we need to */
	if (opts.privport) {
		err = p9_rdma_bind_privport(rdma);
		if (err < 0) {
			pr_err("%s (%d): problem binding to privport: %d\n",
			       __func__, task_pid_nr(current), -err);
			goto error;
		}
	}

	/* Resolve the server's address */
	rdma->addr.sin_family = AF_INET;
	rdma->addr.sin_addr.s_addr = in_aton(addr);
	rdma->addr.sin_port = htons(opts.port);
	err = rdma_resolve_addr(rdma->cm_id, NULL,
				(struct sockaddr *)&rdma->addr,
				rdma->timeout);
	if (err)
		goto error;
	err = wait_for_completion_interruptible(&rdma->cm_done);
	if (err || (rdma->state != P9_RDMA_ADDR_RESOLVED))
		goto error;

	/* Resolve the route to the server */
	err = rdma_resolve_route(rdma->cm_id, rdma->timeout);
	if (err)
		goto error;
	err = wait_for_completion_interruptible(&rdma->cm_done);
	if (err || (rdma->state != P9_RDMA_ROUTE_RESOLVED))
		goto error;

	/* Create the Completion Queue */
	rdma->cq = ib_alloc_cq_any(rdma->cm_id->device, client,
				   opts.sq_depth + opts.rq_depth + 1,
				   IB_POLL_SOFTIRQ);
	if (IS_ERR(rdma->cq))
		goto error;

	/* Create the Protection Domain */
	rdma->pd = ib_alloc_pd(rdma->cm_id->device, 0);
	if (IS_ERR(rdma->pd))
		goto error;

	/* Create the Queue Pair */
	memset(&qp_attr, 0, sizeof qp_attr);
	qp_attr.event_handler = qp_event_handler;
	qp_attr.qp_context = client;
	qp_attr.cap.max_send_wr = opts.sq_depth;
	qp_attr.cap.max_recv_wr = opts.rq_depth;
	qp_attr.cap.max_send_sge = P9_RDMA_SEND_SGE;
	qp_attr.cap.max_recv_sge = P9_RDMA_RECV_SGE;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	qp_attr.send_cq = rdma->cq;
	qp_attr.recv_cq = rdma->cq;
	err = rdma_create_qp(rdma->cm_id, rdma->pd, &qp_attr);
	if (err)
		goto error;
	rdma->qp = rdma->cm_id->qp;

	/* Request a connection */
	memset(&conn_param, 0, sizeof(conn_param));
	conn_param.private_data = NULL;
	conn_param.private_data_len = 0;
	conn_param.responder_resources = P9_RDMA_IRD;
	conn_param.initiator_depth = P9_RDMA_ORD;
	err = rdma_connect(rdma->cm_id, &conn_param);
	if (err)
		goto error;
	err = wait_for_completion_interruptible(&rdma->cm_done);
	if (err || (rdma->state != P9_RDMA_CONNECTED))
		goto error;

	client->status = Connected;

	return 0;

 error:
	rdma_destroy_trans(rdma);
	return -ENOTCONN;
}

static struct p9_trans_module p9_rdma_trans = {
	.name = "rdma",
	.maxsize = P9_RDMA_MAXSIZE,
	.pooled_rbuffers = true,
	.def = false,
	.supports_vmalloc = false,
	.owner = THIS_MODULE,
	.create = rdma_create_trans,
	.close = rdma_close,
	.request = rdma_request,
	.cancel = rdma_cancel,
	.cancelled = rdma_cancelled,
	.show_options = p9_rdma_show_options,
};

/**
 * p9_trans_rdma_init - Register the 9P RDMA transport driver
 */
static int __init p9_trans_rdma_init(void)
{
	v9fs_register_trans(&p9_rdma_trans);
	return 0;
}

static void __exit p9_trans_rdma_exit(void)
{
	v9fs_unregister_trans(&p9_rdma_trans);
}

module_init(p9_trans_rdma_init);
module_exit(p9_trans_rdma_exit);
MODULE_ALIAS_9P("rdma");

MODULE_AUTHOR("Tom Tucker <tom@opengridcomputing.com>");
MODULE_DESCRIPTION("RDMA Transport for 9P");
MODULE_LICENSE("Dual BSD/GPL");