/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
 */
#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE	PAGE_SIZE
#define NVMET_RDMA_MAX_INLINE_SGE		4
#define NVMET_RDMA_MAX_INLINE_DATA_SIZE		max_t(int, SZ_16K, PAGE_SIZE)

struct nvmet_rdma_cmd {
	struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
	struct ib_cqe cqe;
	struct ib_recv_wr wr;
	struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
	struct nvme_command *nvme_cmd;
	struct nvmet_rdma_queue *queue;
};

enum {
	NVMET_RDMA_REQ_INLINE_DATA	= (1 << 0),
	NVMET_RDMA_REQ_INVALIDATE_RKEY	= (1 << 1),
};

struct nvmet_rdma_rsp {
	struct ib_sge send_sge;
	struct ib_cqe send_cqe;
	struct ib_send_wr send_wr;

	struct nvmet_rdma_cmd *cmd;
	struct nvmet_rdma_queue *queue;

	struct ib_cqe read_cqe;
	struct rdma_rw_ctx rw;

	struct nvmet_req req;

	u8 n_rdma;
	u32 flags;
	u32 invalidate_rkey;

	struct list_head wait_list;
	struct list_head free_list;
};

enum nvmet_rdma_queue_state {
	NVMET_RDMA_Q_CONNECTING,
	NVMET_RDMA_Q_LIVE,
	NVMET_RDMA_Q_DISCONNECTING,
};

struct nvmet_rdma_queue {
	struct rdma_cm_id *cm_id;
	struct nvmet_port *port;
	struct ib_cq *cq;
	atomic_t sq_wr_avail;
	struct nvmet_rdma_device *dev;
	spinlock_t state_lock;
	enum nvmet_rdma_queue_state state;
	struct nvmet_cq nvme_cq;
	struct nvmet_sq nvme_sq;

	struct nvmet_rdma_rsp *rsps;
	struct list_head free_rsps;
	spinlock_t rsps_lock;
	struct nvmet_rdma_cmd *cmds;

	struct work_struct release_work;
	struct list_head rsp_wait_list;
	struct list_head rsp_wr_wait_list;
	spinlock_t rsp_wr_wait_lock;

	int idx;
	int host_qid;
	int recv_queue_size;
	int send_queue_size;

	struct list_head queue_list;
};

struct nvmet_rdma_device {
	struct ib_device *device;
	struct ib_pd *pd;
	struct ib_srq *srq;
	struct nvmet_rdma_cmd *srq_cmds;
	size_t srq_size;
	struct kref ref;
	struct list_head entry;
	int inline_data_size;
	int inline_page_count;
};

static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);

static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;

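/*
 * Round a byte count up to the number of pages needed to hold it; used to
 * size the per-command inline data scatterlist.
 */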
static int num_pages(int len)
{
	return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
}

/* XXX: really should move to a generic header sooner or later.. */
static inline u32 get_unaligned_le24(const u8 *p)
{
	return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
}

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
	return nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
	return !nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!rsp->req.rsp->status &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_rsp *rsp;
	unsigned long flags;

	spin_lock_irqsave(&queue->rsps_lock, flags);
	rsp = list_first_entry(&queue->free_rsps,
			struct nvmet_rdma_rsp, free_list);
	list_del(&rsp->free_list);
	spin_unlock_irqrestore(&queue->rsps_lock, flags);

	return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
	unsigned long flags;

	spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
	list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}

static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	int i;

	if (!ndev->inline_data_size)
		return;

	sg = c->inline_sg;
	sge = &c->sge[1];

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
}

static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	struct page *pg;
	int len;
	int i;

	if (!ndev->inline_data_size)
		return 0;

	sg = c->inline_sg;
	sg_init_table(sg, ndev->inline_page_count);
	sge = &c->sge[1];
	len = ndev->inline_data_size;

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		pg = alloc_page(GFP_KERNEL);
		if (!pg)
			goto out_err;
		sg_assign_page(sg, pg);
		sge->addr = ib_dma_map_page(ndev->device,
			pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ndev->device, sge->addr))
			goto out_err;
		sge->length = min_t(int, len, PAGE_SIZE);
		sge->lkey = ndev->pd->local_dma_lkey;
		len -= sge->length;
	}

	return 0;
out_err:
	for (; i >= 0; i--, sg--, sge--) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
	return -ENOMEM;
}

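/*
 * Set up one RDMA RECV command: a DMA-mapped NVMe command capsule in sge[0]
 * and, for I/O queues, the inline data pages in the remaining SGEs.
 */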
static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
			struct nvmet_rdma_cmd *c, bool admin)
{
	/* NVMe command / RDMA RECV */
	c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
	if (!c->nvme_cmd)
		goto out;

	c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
		goto out_free_cmd;

	c->sge[0].length = sizeof(*c->nvme_cmd);
	c->sge[0].lkey = ndev->pd->local_dma_lkey;

	if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
		goto out_unmap_cmd;

	c->cqe.done = nvmet_rdma_recv_done;

	c->wr.wr_cqe = &c->cqe;
	c->wr.sg_list = c->sge;
	c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;

	return 0;

out_unmap_cmd:
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
	kfree(c->nvme_cmd);

out:
	return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	if (!admin)
		nvmet_rdma_free_inline_pages(ndev, c);
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
		int nr_cmds, bool admin)
{
	struct nvmet_rdma_cmd *cmds;
	int ret = -EINVAL, i;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
		if (ret)
			goto out_free;
	}

	return cmds;

out_free:
	while (--i >= 0)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
out:
	return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
	int i;

	for (i = 0; i < nr_cmds; i++)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
}

static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	/* NVMe CQE / RDMA SEND */
	r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
	if (!r->req.rsp)
		goto out;

	r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
			sizeof(*r->req.rsp), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
		goto out_free_rsp;

	r->send_sge.length = sizeof(*r->req.rsp);
	r->send_sge.lkey = ndev->pd->local_dma_lkey;

	r->send_cqe.done = nvmet_rdma_send_done;

	r->send_wr.wr_cqe = &r->send_cqe;
	r->send_wr.sg_list = &r->send_sge;
	r->send_wr.num_sge = 1;
	r->send_wr.send_flags = IB_SEND_SIGNALED;

	/* Data In / RDMA READ */
	r->read_cqe.done = nvmet_rdma_read_data_done;
	return 0;

out_free_rsp:
	kfree(r->req.rsp);
out:
	return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	ib_dma_unmap_single(ndev->device, r->send_sge.addr,
			sizeof(*r->req.rsp), DMA_TO_DEVICE);
	kfree(r->req.rsp);
}

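/*
 * Each queue keeps twice as many response contexts as RECV buffers,
 * presumably so that a response whose SEND completion is still outstanding
 * does not starve a newly received command of a free context.
 */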
static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int nr_rsps = queue->recv_queue_size * 2;
	int ret = -EINVAL, i;

	queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
			GFP_KERNEL);
	if (!queue->rsps)
		goto out;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		ret = nvmet_rdma_alloc_rsp(ndev, rsp);
		if (ret)
			goto out_free;

		list_add_tail(&rsp->free_list, &queue->free_rsps);
	}

	return 0;

out_free:
	while (--i >= 0) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
out:
	return ret;
}

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int i, nr_rsps = queue->recv_queue_size * 2;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
}

static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmd)
{
	struct ib_recv_wr *bad_wr;
	int ret;

	ib_dma_sync_single_for_device(ndev->device,
		cmd->sge[0].addr, cmd->sge[0].length,
		DMA_FROM_DEVICE);

	if (ndev->srq)
		ret = ib_post_srq_recv(ndev->srq, &cmd->wr, &bad_wr);
	else
		ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, &bad_wr);

	if (unlikely(ret))
		pr_err("post_recv cmd failed\n");

	return ret;
}

static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
	spin_lock(&queue->rsp_wr_wait_lock);
	while (!list_empty(&queue->rsp_wr_wait_list)) {
		struct nvmet_rdma_rsp *rsp;
		bool ret;

		rsp = list_entry(queue->rsp_wr_wait_list.next,
				struct nvmet_rdma_rsp, wait_list);
		list_del(&rsp->wait_list);

		spin_unlock(&queue->rsp_wr_wait_lock);
		ret = nvmet_rdma_execute_command(rsp);
		spin_lock(&queue->rsp_wr_wait_lock);

		if (!ret) {
			list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
			break;
		}
	}
	spin_unlock(&queue->rsp_wr_wait_lock);
}

static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

	if (rsp->n_rdma) {
		rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, rsp->req.sg,
				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	}

	if (rsp->req.sg != rsp->cmd->inline_sg)
		sgl_free(rsp->req.sg);

	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
		nvmet_rdma_process_wr_wait_list(queue);

	nvmet_rdma_put_rsp(rsp);
}

static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
	if (queue->nvme_sq.ctrl) {
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	} else {
		/*
		 * We didn't set up the controller yet in case of an admin
		 * connect error, so just disconnect and clean up the queue.
		 */
		nvmet_rdma_queue_disconnect(queue);
	}
}

static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);

	nvmet_rdma_release_rsp(rsp);

	if (unlikely(wc->status != IB_WC_SUCCESS &&
		     wc->status != IB_WC_WR_FLUSH_ERR)) {
		pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
			wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
		nvmet_rdma_error_comp(rsp->queue);
	}
}

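/*
 * ->queue_response fabrics callback: repost the command's RECV buffer, then
 * chain any RDMA WRITE work requests for data-out in front of the response
 * SEND.  SEND_WITH_INV is used when the host asked for remote invalidation.
 */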
static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(req, struct nvmet_rdma_rsp, req);
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	struct ib_send_wr *first_wr, *bad_wr;

	if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
		rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
		rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
	} else {
		rsp->send_wr.opcode = IB_WR_SEND;
	}

	if (nvmet_rdma_need_data_out(rsp))
		first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
				cm_id->port_num, NULL, &rsp->send_wr);
	else
		first_wr = &rsp->send_wr;

	nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

	ib_dma_sync_single_for_device(rsp->queue->dev->device,
		rsp->send_sge.addr, rsp->send_sge.length,
		DMA_TO_DEVICE);

	if (unlikely(ib_post_send(cm_id->qp, first_wr, &bad_wr))) {
		pr_err("sending cmd response failed\n");
		nvmet_rdma_release_rsp(rsp);
	}
}

static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	WARN_ON(rsp->n_rdma <= 0);
	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
			queue->cm_id->port_num, rsp->req.sg,
			rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	rsp->n_rdma = 0;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		nvmet_req_uninit(&rsp->req);
		nvmet_rdma_release_rsp(rsp);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	nvmet_req_execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
		u64 off)
{
	int sg_count = num_pages(len);
	struct scatterlist *sg;
	int i;

	sg = rsp->cmd->inline_sg;
	for (i = 0; i < sg_count; i++, sg++) {
		if (i < sg_count - 1)
			sg_unmark_end(sg);
		else
			sg_mark_end(sg);
		sg->offset = off;
		sg->length = min_t(int, len, PAGE_SIZE - off);
		len -= sg->length;
		if (!i)
			off = 0;
	}

	rsp->req.sg = rsp->cmd->inline_sg;
	rsp->req.sg_cnt = sg_count;
}

static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
	u64 off = le64_to_cpu(sgl->addr);
	u32 len = le32_to_cpu(sgl->length);

	if (!nvme_is_write(rsp->req.cmd))
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

	if (off + len > rsp->queue->dev->inline_data_size) {
		pr_err("invalid inline data offset!\n");
		return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
	}

	/* no data command? */
	if (!len)
		return 0;

	nvmet_rdma_use_inline_sg(rsp, len, off);
	rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
	rsp->req.transfer_len += len;
	return 0;
}

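/*
 * Keyed SGL: allocate a scatterlist for the full transfer and set up an
 * rdma_rw context that will RDMA READ from (or WRITE to) the host memory
 * described by addr/key.  rdma_rw_ctx_init() returns the number of WRs it
 * will consume, which is charged against sq_wr_avail before execution.
 */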
static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
		struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	u64 addr = le64_to_cpu(sgl->addr);
	u32 len = get_unaligned_le24(sgl->length);
	u32 key = get_unaligned_le32(sgl->key);
	int ret;

	/* no data command? */
	if (!len)
		return 0;

	rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
	if (!rsp->req.sg)
		return NVME_SC_INTERNAL;

	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
			nvmet_data_dir(&rsp->req));
	if (ret < 0)
		return NVME_SC_INTERNAL;
	rsp->req.transfer_len += len;
	rsp->n_rdma += ret;

	if (invalidate) {
		rsp->invalidate_rkey = key;
		rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
	}

	return 0;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

	switch (sgl->type >> 4) {
	case NVME_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_OFFSET:
			return nvmet_rdma_map_sgl_inline(rsp);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	case NVME_KEY_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
		case NVME_SGL_FMT_ADDRESS:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	default:
		pr_err("invalid SGL type: %#x\n", sgl->type);
		return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
	}
}

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
			&queue->sq_wr_avail) < 0)) {
		pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
				1 + rsp->n_rdma, queue->idx,
				queue->nvme_sq.ctrl->cntlid);
		atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
		return false;
	}

	if (nvmet_rdma_need_data_in(rsp)) {
		if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, &rsp->read_cqe, NULL))
			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
	} else {
		nvmet_req_execute(&rsp->req);
	}

	return true;
}

static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
		struct nvmet_rdma_rsp *cmd)
{
	u16 status;

	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
		DMA_FROM_DEVICE);
	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->send_sge.addr, cmd->send_sge.length,
		DMA_TO_DEVICE);

	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_rdma_ops))
		return;

	status = nvmet_rdma_map_sgl(cmd);
	if (status)
		goto out_err;

	if (unlikely(!nvmet_rdma_execute_command(cmd))) {
		spin_lock(&queue->rsp_wr_wait_lock);
		list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
		spin_unlock(&queue->rsp_wr_wait_lock);
	}

	return;

out_err:
	nvmet_req_complete(&cmd->req, status);
}

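/*
 * RECV completion: a new command capsule arrived.  Pair it with a free
 * response context and hand it to the core, unless the queue is still
 * connecting, in which case it is parked on rsp_wait_list until the
 * RDMA_CM_EVENT_ESTABLISHED handler flushes it.
 */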
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_cmd *cmd =
		container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;
	struct nvmet_rdma_rsp *rsp;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status),
				wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
		pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
		nvmet_rdma_error_comp(queue);
		return;
	}

	cmd->queue = queue;
	rsp = nvmet_rdma_get_rsp(queue);
	rsp->queue = queue;
	rsp->cmd = cmd;
	rsp->flags = 0;
	rsp->req.cmd = cmd->nvme_cmd;
	rsp->req.port = queue->port;
	rsp->n_rdma = 0;

	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
		unsigned long flags;

		spin_lock_irqsave(&queue->state_lock, flags);
		if (queue->state == NVMET_RDMA_Q_CONNECTING)
			list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
		else
			nvmet_rdma_put_rsp(rsp);
		spin_unlock_irqrestore(&queue->state_lock, flags);
		return;
	}

	nvmet_rdma_handle_command(queue, rsp);
}

static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
	if (!ndev->srq)
		return;

	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
	ib_destroy_srq(ndev->srq);
}

static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
	struct ib_srq_init_attr srq_attr = { NULL, };
	struct ib_srq *srq;
	size_t srq_size;
	int ret, i;

	srq_size = 4095;	/* XXX: tune */

	srq_attr.attr.max_wr = srq_size;
	srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
	srq_attr.attr.srq_limit = 0;
	srq_attr.srq_type = IB_SRQT_BASIC;
	srq = ib_create_srq(ndev->pd, &srq_attr);
	if (IS_ERR(srq)) {
		/*
		 * If SRQs aren't supported we just go ahead and use normal
		 * non-shared receive queues.
		 */
		pr_info("SRQ requested but not supported.\n");
		return 0;
	}

	ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
	if (IS_ERR(ndev->srq_cmds)) {
		ret = PTR_ERR(ndev->srq_cmds);
		goto out_destroy_srq;
	}

	ndev->srq = srq;
	ndev->srq_size = srq_size;

	for (i = 0; i < srq_size; i++) {
		ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
		if (ret)
			goto out_free_cmds;
	}

	return 0;

out_free_cmds:
	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
out_destroy_srq:
	ib_destroy_srq(srq);
	return ret;
}

static void nvmet_rdma_free_dev(struct kref *ref)
{
	struct nvmet_rdma_device *ndev =
		container_of(ref, struct nvmet_rdma_device, ref);

	mutex_lock(&device_list_mutex);
	list_del(&ndev->entry);
	mutex_unlock(&device_list_mutex);

	nvmet_rdma_destroy_srq(ndev);
	ib_dealloc_pd(ndev->pd);

	kfree(ndev);
}

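/*
 * Look up (or create) the per-IB-device state for a connection request:
 * one protection domain and, if use_srq was requested, a shared receive
 * queue per device.  Devices are refcounted and shared by all queues and
 * listeners on the same HCA.
 */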
static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
	struct nvmet_port *port = cm_id->context;
	struct nvmet_rdma_device *ndev;
	int inline_page_count;
	int inline_sge_count;
	int ret;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device->node_guid == cm_id->device->node_guid &&
		    kref_get_unless_zero(&ndev->ref))
			goto out_unlock;
	}

	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
	if (!ndev)
		goto out_err;

	inline_page_count = num_pages(port->inline_data_size);
	inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
				cm_id->device->attrs.max_sge) - 1;
	if (inline_page_count > inline_sge_count) {
		pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
			port->inline_data_size, cm_id->device->name,
			inline_sge_count * PAGE_SIZE);
		port->inline_data_size = inline_sge_count * PAGE_SIZE;
		inline_page_count = inline_sge_count;
	}
	ndev->inline_data_size = port->inline_data_size;
	ndev->inline_page_count = inline_page_count;
	ndev->device = cm_id->device;
	kref_init(&ndev->ref);

	ndev->pd = ib_alloc_pd(ndev->device, 0);
	if (IS_ERR(ndev->pd))
		goto out_free_dev;

	if (nvmet_rdma_use_srq) {
		ret = nvmet_rdma_init_srq(ndev);
		if (ret)
			goto out_free_pd;
	}

	list_add(&ndev->entry, &device_list);
out_unlock:
	mutex_unlock(&device_list_mutex);
	pr_debug("added %s.\n", ndev->device->name);
	return ndev;

out_free_pd:
	ib_dealloc_pd(ndev->pd);
out_free_dev:
	kfree(ndev);
out_err:
	mutex_unlock(&device_list_mutex);
	return NULL;
}

static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp_init_attr qp_attr;
	struct nvmet_rdma_device *ndev = queue->dev;
	int comp_vector, nr_cqe, ret, i;

	/*
	 * Spread the io queues across completion vectors,
	 * but still keep all admin queues on vector 0.
	 */
	comp_vector = !queue->host_qid ? 0 :
		queue->idx % ndev->device->num_comp_vectors;

	/*
	 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
	 */
	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

	queue->cq = ib_alloc_cq(ndev->device, queue,
			nr_cqe + 1, comp_vector,
			IB_POLL_WORKQUEUE);
	if (IS_ERR(queue->cq)) {
		ret = PTR_ERR(queue->cq);
		pr_err("failed to create CQ cqe= %d ret= %d\n",
		       nr_cqe + 1, ret);
		goto out;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_context = queue;
	qp_attr.event_handler = nvmet_rdma_qp_event;
	qp_attr.send_cq = queue->cq;
	qp_attr.recv_cq = queue->cq;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	/* +1 for drain */
	qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
	qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
	qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
					ndev->device->attrs.max_sge);

	if (ndev->srq) {
		qp_attr.srq = ndev->srq;
	} else {
		/* +1 for drain */
		qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
		qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
	}

	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
	if (ret) {
		pr_err("failed to create_qp ret= %d\n", ret);
		goto err_destroy_cq;
	}

	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
		 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
		 qp_attr.cap.max_send_wr, queue->cm_id);

	if (!ndev->srq) {
		for (i = 0; i < queue->recv_queue_size; i++) {
			queue->cmds[i].queue = queue;
			ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
			if (ret)
				goto err_destroy_qp;
		}
	}

out:
	return ret;

err_destroy_qp:
	rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
	ib_free_cq(queue->cq);
	goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp *qp = queue->cm_id->qp;

	ib_drain_qp(qp);
	rdma_destroy_id(queue->cm_id);
	ib_destroy_qp(qp);
	ib_free_cq(queue->cq);
}

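/*
 * Final queue teardown: drain and destroy the QP/CQ/cm_id, then release the
 * command and response pools and the queue index.  Runs from the release
 * work item, never from the RDMA CM callback itself.
 */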
static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
	pr_debug("freeing queue %d\n", queue->idx);

	nvmet_sq_destroy(&queue->nvme_sq);

	nvmet_rdma_destroy_queue_ib(queue);
	if (!queue->dev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
	nvmet_rdma_free_rsps(queue);
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
	kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
	struct nvmet_rdma_queue *queue =
		container_of(w, struct nvmet_rdma_queue, release_work);
	struct nvmet_rdma_device *dev = queue->dev;

	nvmet_rdma_free_queue(queue);

	kref_put(&dev->ref, nvmet_rdma_free_dev);
}

static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
				struct nvmet_rdma_queue *queue)
{
	struct nvme_rdma_cm_req *req;

	req = (struct nvme_rdma_cm_req *)conn->private_data;
	if (!req || conn->private_data_len == 0)
		return NVME_RDMA_CM_INVALID_LEN;

	if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
		return NVME_RDMA_CM_INVALID_RECFMT;

	queue->host_qid = le16_to_cpu(req->qid);

	/*
	 * req->hsqsize corresponds to our recv queue size plus 1
	 * req->hrqsize corresponds to our send queue size
	 */
	queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
	queue->send_queue_size = le16_to_cpu(req->hrqsize);

	if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
		return NVME_RDMA_CM_INVALID_HSQSIZE;

	/* XXX: Should we enforce some kind of max for IO queues? */

	return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
		enum nvme_rdma_cm_status status)
{
	struct nvme_rdma_cm_rej rej;

	pr_debug("rejecting connect request: status %d (%s)\n",
		 status, nvme_rdma_cm_msg(status));

	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	rej.sts = cpu_to_le16(status);

	return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}

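/*
 * Allocate a queue for an incoming connect request: parse the NVMe/RDMA CM
 * private data, reserve a queue index, allocate the response (and, without
 * an SRQ, command) pools and create the CQ/QP.  On any failure the connect
 * request is rejected with an NVMe/RDMA CM status code.
 */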
static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
		struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_reject;
	}

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_queue;
	}

	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
	if (ret)
		goto out_destroy_sq;

	/*
	 * Schedules the actual release because calling rdma_destroy_id from
	 * inside a CM callback would trigger a deadlock. (great API design..)
	 */
	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
	queue->dev = ndev;
	queue->cm_id = cm_id;

	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_RDMA_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->rsp_wait_list);
	INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
	spin_lock_init(&queue->rsp_wr_wait_lock);
	INIT_LIST_HEAD(&queue->free_rsps);
	spin_lock_init(&queue->rsps_lock);
	INIT_LIST_HEAD(&queue->queue_list);

	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_destroy_sq;
	}

	ret = nvmet_rdma_alloc_rsps(queue);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_ida_remove;
	}

	if (!ndev->srq) {
		queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				queue->recv_queue_size,
				!queue->host_qid);
		if (IS_ERR(queue->cmds)) {
			ret = NVME_RDMA_CM_NO_RSC;
			goto out_free_responses;
		}
	}

	ret = nvmet_rdma_create_queue_ib(queue);
	if (ret) {
		pr_err("%s: creating RDMA queue failed (%d).\n",
			__func__, ret);
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_cmds;
	}

	return queue;

out_free_cmds:
	if (!ndev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
out_free_responses:
	nvmet_rdma_free_rsps(queue);
out_ida_remove:
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
	kfree(queue);
out_reject:
	nvmet_rdma_cm_reject(cm_id, ret);
	return NULL;
}

static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
	struct nvmet_rdma_queue *queue = priv;

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(queue->cm_id, event->event);
		break;
	default:
		pr_err("received IB QP event: %s (%d)\n",
		       ib_event_msg(event->event), event->event);
		break;
	}
}

static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue,
		struct rdma_conn_param *p)
{
	struct rdma_conn_param param = { };
	struct nvme_rdma_cm_rep priv = { };
	int ret = -ENOMEM;

	param.rnr_retry_count = 7;
	param.flow_control = 1;
	param.initiator_depth = min_t(u8, p->initiator_depth,
		queue->dev->device->attrs.max_qp_init_rd_atom);
	param.private_data = &priv;
	param.private_data_len = sizeof(priv);
	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	priv.crqsize = cpu_to_le16(queue->recv_queue_size);

	ret = rdma_accept(cm_id, &param);
	if (ret)
		pr_err("rdma_accept failed (error code = %d)\n", ret);

	return ret;
}

static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_device *ndev;
	struct nvmet_rdma_queue *queue;
	int ret = -EINVAL;

	ndev = nvmet_rdma_find_get_device(cm_id);
	if (!ndev) {
		nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
		return -ECONNREFUSED;
	}

	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
	if (!queue) {
		ret = -ENOMEM;
		goto put_device;
	}
	queue->port = cm_id->context;

	if (queue->host_qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
	if (ret) {
		schedule_work(&queue->release_work);
		/* Destroying rdma_cm id is not needed here */
		return 0;
	}

	mutex_lock(&nvmet_rdma_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	return 0;

put_device:
	kref_put(&ndev->ref, nvmet_rdma_free_dev);

	return ret;
}

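/*
 * RDMA_CM_EVENT_ESTABLISHED: mark the queue live and replay any commands
 * that were received while the connection was still being set up.
 */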
static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
	unsigned long flags;

	spin_lock_irqsave(&queue->state_lock, flags);
	if (queue->state != NVMET_RDMA_Q_CONNECTING) {
		pr_warn("trying to establish a connected queue\n");
		goto out_unlock;
	}
	queue->state = NVMET_RDMA_Q_LIVE;

	while (!list_empty(&queue->rsp_wait_list)) {
		struct nvmet_rdma_rsp *cmd;

		cmd = list_first_entry(&queue->rsp_wait_list,
					struct nvmet_rdma_rsp, wait_list);
		list_del(&cmd->wait_list);

		spin_unlock_irqrestore(&queue->state_lock, flags);
		nvmet_rdma_handle_command(queue, cmd);
		spin_lock_irqsave(&queue->state_lock, flags);
	}

out_unlock:
	spin_unlock_irqrestore(&queue->state_lock, flags);
}

static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;
	unsigned long flags;

	pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

	spin_lock_irqsave(&queue->state_lock, flags);
	switch (queue->state) {
	case NVMET_RDMA_Q_CONNECTING:
	case NVMET_RDMA_Q_LIVE:
		queue->state = NVMET_RDMA_Q_DISCONNECTING;
		disconnect = true;
		break;
	case NVMET_RDMA_Q_DISCONNECTING:
		break;
	}
	spin_unlock_irqrestore(&queue->state_lock, flags);

	if (disconnect) {
		rdma_disconnect(queue->cm_id);
		schedule_work(&queue->release_work);
	}
}

static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list)) {
		list_del_init(&queue->queue_list);
		disconnect = true;
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	if (disconnect)
		__nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list))
		list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	pr_err("failed to connect queue %d\n", queue->idx);
	schedule_work(&queue->release_work);
}

/**
 * nvmet_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug.  Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (in which case
 * queue will be NULL).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids.  In that case
 * we nullify the priv to prevent double cm_id destruction and destroy
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	struct nvmet_port *port;

	if (queue) {
		/*
		 * This is a queue cm_id.  We have registered
		 * an ib_client to handle queue removal,
		 * so don't interfere and just return.
		 */
		return 0;
	}

	port = cm_id->context;

	/*
	 * This is a listener cm_id.  Make sure that
	 * future remove_port won't invoke a double
	 * cm_id destroy.  Use atomic xchg to make sure
	 * we don't compete with remove_port.
	 */
	if (xchg(&port->priv, NULL) != cm_id)
		return 0;

	/*
	 * We need to return 1 so that the core will destroy
	 * its own ID.  What a great API design..
	 */
	return 1;
}

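/*
 * Central RDMA CM event handler for both the listening port cm_id and the
 * per-queue cm_ids.  The return value matters: a non-zero return tells the
 * RDMA CM to destroy the cm_id itself (used on device removal).
 */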
static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue = NULL;
	int ret = 0;

	if (cm_id->qp)
		queue = cm_id->qp->qp_context;

	pr_debug("%s (%d): status %d id %p\n",
		rdma_event_msg(event->event), event->event,
		event->status, cm_id);

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = nvmet_rdma_queue_connect(cm_id, event);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		nvmet_rdma_queue_established(queue);
		break;
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		nvmet_rdma_queue_disconnect(queue);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		ret = nvmet_rdma_device_removal(cm_id, queue);
		break;
	case RDMA_CM_EVENT_REJECTED:
		pr_debug("Connection rejected: %s\n",
			 rdma_reject_msg(cm_id, event->status));
		/* FALLTHROUGH */
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
		nvmet_rdma_queue_connect_fail(cm_id, queue);
		break;
	default:
		pr_err("received unrecognized RDMA CM event %d\n",
			event->event);
		break;
	}

	return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_rdma_queue *queue;

restart:
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
		if (queue->nvme_sq.ctrl == ctrl) {
			list_del_init(&queue->queue_list);
			mutex_unlock(&nvmet_rdma_queue_mutex);

			__nvmet_rdma_queue_disconnect(queue);
			goto restart;
		}
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);
}

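/*
 * ->add_port fabrics callback: resolve the configured traddr/trsvcid, clamp
 * the port's inline data size to what this transport supports, and start an
 * RDMA CM listener bound to that address.
 */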
static int nvmet_rdma_add_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_storage addr = { };
	__kernel_sa_family_t af;
	int ret;

	switch (port->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				port->disc_addr.adrfam);
		return -EINVAL;
	}

	if (port->inline_data_size < 0) {
		port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
	} else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
		pr_warn("inline_data_size %u is too large, reducing to %u\n",
			port->inline_data_size,
			NVMET_RDMA_MAX_INLINE_DATA_SIZE);
		port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
	}

	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
			port->disc_addr.trsvcid, &addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			port->disc_addr.traddr, port->disc_addr.trsvcid);
		return ret;
	}

	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("CM ID creation failed\n");
		return PTR_ERR(cm_id);
	}

	/*
	 * Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
	ret = rdma_set_afonly(cm_id, 1);
	if (ret) {
		pr_err("rdma_set_afonly failed (%d)\n", ret);
		goto out_destroy_id;
	}

	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
	if (ret) {
		pr_err("binding CM ID to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	ret = rdma_listen(cm_id, 128);
	if (ret) {
		pr_err("listening to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	pr_info("enabling port %d (%pISpcs)\n",
		le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
	port->priv = cm_id;
	return 0;

out_destroy_id:
	rdma_destroy_id(cm_id);
	return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);

	if (cm_id)
		rdma_destroy_id(cm_id);
}

static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *port, char *traddr)
{
	struct rdma_cm_id *cm_id = port->priv;

	if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
		struct nvmet_rdma_rsp *rsp =
			container_of(req, struct nvmet_rdma_rsp, req);
		struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
		struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;

		sprintf(traddr, "%pISc", addr);
	} else {
		memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}

static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
	.owner = THIS_MODULE,
	.type = NVMF_TRTYPE_RDMA,
	.msdbd = 1,
	.has_keyed_sgls = 1,
	.add_port = nvmet_rdma_add_port,
	.remove_port = nvmet_rdma_remove_port,
	.queue_response = nvmet_rdma_queue_response,
	.delete_ctrl = nvmet_rdma_delete_ctrl,
	.disc_traddr = nvmet_rdma_disc_port_addr,
};

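/*
 * ib_client ->remove callback: the underlying IB device is going away, so
 * disconnect every queue that was created on it and wait for the scheduled
 * release work to finish before the device is finally unplugged.
 */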
static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
	struct nvmet_rdma_queue *queue, *tmp;
	struct nvmet_rdma_device *ndev;
	bool found = false;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device == ib_device) {
			found = true;
			break;
		}
	}
	mutex_unlock(&device_list_mutex);

	if (!found)
		return;

	/*
	 * IB Device that is used by nvmet controllers is being removed,
	 * delete all queues using this device.
	 */
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
				 queue_list) {
		if (queue->dev->device != ib_device)
			continue;

		pr_info("Removing queue %d\n", queue->idx);
		list_del_init(&queue->queue_list);
		__nvmet_rdma_queue_disconnect(queue);
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
	.name = "nvmet_rdma",
	.remove = nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
	int ret;

	ret = ib_register_client(&nvmet_rdma_ib_client);
	if (ret)
		return ret;

	ret = nvmet_register_transport(&nvmet_rdma_ops);
	if (ret)
		goto err_ib_client;

	return 0;

err_ib_client:
	ib_unregister_client(&nvmet_rdma_ib_client);
	return ret;
}

static void __exit nvmet_rdma_exit(void)
{
	nvmet_unregister_transport(&nvmet_rdma_ops);
	ib_unregister_client(&nvmet_rdma_ib_client);
	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
	ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */