// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
 */
#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE	PAGE_SIZE
#define NVMET_RDMA_MAX_INLINE_SGE		4
#define NVMET_RDMA_MAX_INLINE_DATA_SIZE		max_t(int, SZ_16K, PAGE_SIZE)

/* Assume mpsmin == device_page_size == 4KB */
#define NVMET_RDMA_MAX_MDTS			8

struct nvmet_rdma_cmd {
	struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
	struct ib_cqe cqe;
	struct ib_recv_wr wr;
	struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
	struct nvme_command *nvme_cmd;
	struct nvmet_rdma_queue *queue;
};

enum {
	NVMET_RDMA_REQ_INLINE_DATA	= (1 << 0),
	NVMET_RDMA_REQ_INVALIDATE_RKEY	= (1 << 1),
};

struct nvmet_rdma_rsp {
	struct ib_sge send_sge;
	struct ib_cqe send_cqe;
	struct ib_send_wr send_wr;

	struct nvmet_rdma_cmd *cmd;
	struct nvmet_rdma_queue *queue;

	struct ib_cqe read_cqe;
	struct rdma_rw_ctx rw;

	struct nvmet_req req;

	bool allocated;
	u8 n_rdma;
	u32 flags;
	u32 invalidate_rkey;

	struct list_head wait_list;
	struct list_head free_list;
};

enum nvmet_rdma_queue_state {
	NVMET_RDMA_Q_CONNECTING,
	NVMET_RDMA_Q_LIVE,
	NVMET_RDMA_Q_DISCONNECTING,
};

struct nvmet_rdma_queue {
	struct rdma_cm_id	*cm_id;
	struct nvmet_port	*port;
	struct ib_cq		*cq;
	atomic_t		sq_wr_avail;
	struct nvmet_rdma_device *dev;
	spinlock_t		state_lock;
	enum nvmet_rdma_queue_state state;
	struct nvmet_cq		nvme_cq;
	struct nvmet_sq		nvme_sq;

	struct nvmet_rdma_rsp	*rsps;
	struct list_head	free_rsps;
	spinlock_t		rsps_lock;
	struct nvmet_rdma_cmd	*cmds;

	struct work_struct	release_work;
	struct list_head	rsp_wait_list;
	struct list_head	rsp_wr_wait_list;
	spinlock_t		rsp_wr_wait_lock;

	int			idx;
	int			host_qid;
	int			recv_queue_size;
	int			send_queue_size;

	struct list_head	queue_list;
};

struct nvmet_rdma_device {
	struct ib_device	*device;
	struct ib_pd		*pd;
	struct ib_srq		*srq;
	struct nvmet_rdma_cmd	*srq_cmds;
	size_t			srq_size;
	struct kref		ref;
	struct list_head	entry;
	int			inline_data_size;
	int			inline_page_count;
};

static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);

static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_rsp *r);
static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_rsp *r);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;

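/*
 * Number of pages needed to cover @len bytes, i.e. DIV_ROUND_UP(len,
 * PAGE_SIZE) for len > 0.  For example, with a 4KB PAGE_SIZE,
 * num_pages(SZ_16K) is 4 and num_pages(1) is 1.
 */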
static int num_pages(int len)
{
	return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
}

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
	return nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
	return !nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!rsp->req.cqe->status &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_rsp *rsp;
	unsigned long flags;

	spin_lock_irqsave(&queue->rsps_lock, flags);
	rsp = list_first_entry_or_null(&queue->free_rsps,
				struct nvmet_rdma_rsp, free_list);
	if (likely(rsp))
		list_del(&rsp->free_list);
	spin_unlock_irqrestore(&queue->rsps_lock, flags);

	if (unlikely(!rsp)) {
		int ret;

		rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
		if (unlikely(!rsp))
			return NULL;
		ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
		if (unlikely(ret)) {
			kfree(rsp);
			return NULL;
		}

		rsp->allocated = true;
	}

	return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
	unsigned long flags;

	if (unlikely(rsp->allocated)) {
		nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
		kfree(rsp);
		return;
	}

	spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
	list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}

static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	int i;

	if (!ndev->inline_data_size)
		return;

	sg = c->inline_sg;
	sge = &c->sge[1];

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
}

static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	struct page *pg;
	int len;
	int i;

	if (!ndev->inline_data_size)
		return 0;

	sg = c->inline_sg;
	sg_init_table(sg, ndev->inline_page_count);
	sge = &c->sge[1];
	len = ndev->inline_data_size;

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		pg = alloc_page(GFP_KERNEL);
		if (!pg)
			goto out_err;
		sg_assign_page(sg, pg);
		sge->addr = ib_dma_map_page(ndev->device,
			pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ndev->device, sge->addr))
			goto out_err;
		sge->length = min_t(int, len, PAGE_SIZE);
		sge->lkey = ndev->pd->local_dma_lkey;
		len -= sge->length;
	}

	return 0;
out_err:
	for (; i >= 0; i--, sg--, sge--) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
	return -ENOMEM;
}

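/*
 * Each command slot is posted as a single RECV work request: sge[0]
 * covers the 64-byte NVMe command capsule and, on I/O queues, the
 * remaining SGEs cover the pre-allocated in-capsule (inline) data pages.
 */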
static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
			struct nvmet_rdma_cmd *c, bool admin)
{
	/* NVMe command / RDMA RECV */
	c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
	if (!c->nvme_cmd)
		goto out;

	c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
		goto out_free_cmd;

	c->sge[0].length = sizeof(*c->nvme_cmd);
	c->sge[0].lkey = ndev->pd->local_dma_lkey;

	if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
		goto out_unmap_cmd;

	c->cqe.done = nvmet_rdma_recv_done;

	c->wr.wr_cqe = &c->cqe;
	c->wr.sg_list = c->sge;
	c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;

	return 0;

out_unmap_cmd:
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
	kfree(c->nvme_cmd);

out:
	return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	if (!admin)
		nvmet_rdma_free_inline_pages(ndev, c);
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
				sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
		int nr_cmds, bool admin)
{
	struct nvmet_rdma_cmd *cmds;
	int ret = -EINVAL, i;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
		if (ret)
			goto out_free;
	}

	return cmds;

out_free:
	while (--i >= 0)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
out:
	return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
	int i;

	for (i = 0; i < nr_cmds; i++)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
}

static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	/* NVMe CQE / RDMA SEND */
	r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL);
	if (!r->req.cqe)
		goto out;

	r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe,
			sizeof(*r->req.cqe), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
		goto out_free_rsp;

	r->req.p2p_client = &ndev->device->dev;
	r->send_sge.length = sizeof(*r->req.cqe);
	r->send_sge.lkey = ndev->pd->local_dma_lkey;

	r->send_cqe.done = nvmet_rdma_send_done;

	r->send_wr.wr_cqe = &r->send_cqe;
	r->send_wr.sg_list = &r->send_sge;
	r->send_wr.num_sge = 1;
	r->send_wr.send_flags = IB_SEND_SIGNALED;

	/* Data In / RDMA READ */
	r->read_cqe.done = nvmet_rdma_read_data_done;
	return 0;

out_free_rsp:
	kfree(r->req.cqe);
out:
	return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	ib_dma_unmap_single(ndev->device, r->send_sge.addr,
				sizeof(*r->req.cqe), DMA_TO_DEVICE);
	kfree(r->req.cqe);
}

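/*
 * A queue allocates twice as many response contexts as receive buffers;
 * the intent (as far as the allocation pattern suggests) is that a
 * response whose SEND has not completed yet does not starve a newly
 * received command.  Should the pool still run dry, nvmet_rdma_get_rsp()
 * falls back to allocating a context on demand.
 */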
static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int nr_rsps = queue->recv_queue_size * 2;
	int ret = -EINVAL, i;

	queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
			GFP_KERNEL);
	if (!queue->rsps)
		goto out;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		ret = nvmet_rdma_alloc_rsp(ndev, rsp);
		if (ret)
			goto out_free;

		list_add_tail(&rsp->free_list, &queue->free_rsps);
	}

	return 0;

out_free:
	while (--i >= 0) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
out:
	return ret;
}

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int i, nr_rsps = queue->recv_queue_size * 2;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
}

static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmd)
{
	int ret;

	ib_dma_sync_single_for_device(ndev->device,
		cmd->sge[0].addr, cmd->sge[0].length,
		DMA_FROM_DEVICE);

	if (ndev->srq)
		ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
	else
		ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);

	if (unlikely(ret))
		pr_err("post_recv cmd failed\n");

	return ret;
}

static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
	spin_lock(&queue->rsp_wr_wait_lock);
	while (!list_empty(&queue->rsp_wr_wait_list)) {
		struct nvmet_rdma_rsp *rsp;
		bool ret;

		rsp = list_entry(queue->rsp_wr_wait_list.next,
				struct nvmet_rdma_rsp, wait_list);
		list_del(&rsp->wait_list);

		spin_unlock(&queue->rsp_wr_wait_lock);
		ret = nvmet_rdma_execute_command(rsp);
		spin_lock(&queue->rsp_wr_wait_lock);

		if (!ret) {
			list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
			break;
		}
	}
	spin_unlock(&queue->rsp_wr_wait_lock);
}

static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

	if (rsp->n_rdma) {
		rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, rsp->req.sg,
				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	}

	if (rsp->req.sg != rsp->cmd->inline_sg)
		nvmet_req_free_sgl(&rsp->req);

	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
		nvmet_rdma_process_wr_wait_list(queue);

	nvmet_rdma_put_rsp(rsp);
}

static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
	if (queue->nvme_sq.ctrl) {
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	} else {
		/*
		 * we didn't set up the controller yet in case of an
		 * admin connect error, so just disconnect and clean up
		 * the queue
		 */
		nvmet_rdma_queue_disconnect(queue);
	}
}

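/*
 * SEND completion: the NVMe completion has been handed to the HCA, so
 * the response context (and any rdma_rw context) can be released.  Flush
 * errors are expected during queue teardown and are not escalated.
 */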
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	nvmet_rdma_release_rsp(rsp);

	if (unlikely(wc->status != IB_WC_SUCCESS &&
		     wc->status != IB_WC_WR_FLUSH_ERR)) {
		pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
			wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
		nvmet_rdma_error_comp(queue);
	}
}

static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(req, struct nvmet_rdma_rsp, req);
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	struct ib_send_wr *first_wr;

	if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
		rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
		rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
	} else {
		rsp->send_wr.opcode = IB_WR_SEND;
	}

	if (nvmet_rdma_need_data_out(rsp))
		first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
				cm_id->port_num, NULL, &rsp->send_wr);
	else
		first_wr = &rsp->send_wr;

	nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

	ib_dma_sync_single_for_device(rsp->queue->dev->device,
		rsp->send_sge.addr, rsp->send_sge.length,
		DMA_TO_DEVICE);

	if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
		pr_err("sending cmd response failed\n");
		nvmet_rdma_release_rsp(rsp);
	}
}

static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	WARN_ON(rsp->n_rdma <= 0);
	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
			queue->cm_id->port_num, rsp->req.sg,
			rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	rsp->n_rdma = 0;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		nvmet_req_uninit(&rsp->req);
		nvmet_rdma_release_rsp(rsp);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	rsp->req.execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
		u64 off)
{
	int sg_count = num_pages(len);
	struct scatterlist *sg;
	int i;

	sg = rsp->cmd->inline_sg;
	for (i = 0; i < sg_count; i++, sg++) {
		if (i < sg_count - 1)
			sg_unmark_end(sg);
		else
			sg_mark_end(sg);
		sg->offset = off;
		sg->length = min_t(int, len, PAGE_SIZE - off);
		len -= sg->length;
		if (!i)
			off = 0;
	}

	rsp->req.sg = rsp->cmd->inline_sg;
	rsp->req.sg_cnt = sg_count;
}

static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
	u64 off = le64_to_cpu(sgl->addr);
	u32 len = le32_to_cpu(sgl->length);

	if (!nvme_is_write(rsp->req.cmd)) {
		rsp->req.error_loc =
			offsetof(struct nvme_common_command, opcode);
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
	}

	if (off + len > rsp->queue->dev->inline_data_size) {
		pr_err("invalid inline data offset!\n");
		return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
	}

	/* no data command? */
	if (!len)
		return 0;

	nvmet_rdma_use_inline_sg(rsp, len, off);
	rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
	rsp->req.transfer_len += len;
	return 0;
}

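/*
 * Keyed SGL: the host supplied a remote (addr, rkey, length) region.
 * Allocate a local SG list and set up an rdma_rw context that will later
 * drive RDMA READs (host-to-target writes) or be chained in front of the
 * response SEND for RDMA WRITEs (target-to-host reads).
 */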
static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
		struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	u64 addr = le64_to_cpu(sgl->addr);
	u32 key = get_unaligned_le32(sgl->key);
	int ret;

	rsp->req.transfer_len = get_unaligned_le24(sgl->length);

	/* no data command? */
	if (!rsp->req.transfer_len)
		return 0;

	ret = nvmet_req_alloc_sgl(&rsp->req);
	if (unlikely(ret < 0))
		goto error_out;

	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
			nvmet_data_dir(&rsp->req));
	if (unlikely(ret < 0))
		goto error_out;
	rsp->n_rdma += ret;

	if (invalidate) {
		rsp->invalidate_rkey = key;
		rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
	}

	return 0;

error_out:
	rsp->req.transfer_len = 0;
	return NVME_SC_INTERNAL;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

	switch (sgl->type >> 4) {
	case NVME_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_OFFSET:
			return nvmet_rdma_map_sgl_inline(rsp);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			rsp->req.error_loc =
				offsetof(struct nvme_common_command, dptr);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	case NVME_KEY_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
		case NVME_SGL_FMT_ADDRESS:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			rsp->req.error_loc =
				offsetof(struct nvme_common_command, dptr);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	default:
		pr_err("invalid SGL type: %#x\n", sgl->type);
		rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
		return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
	}
}

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
			&queue->sq_wr_avail) < 0)) {
		pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
				1 + rsp->n_rdma, queue->idx,
				queue->nvme_sq.ctrl->cntlid);
		atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
		return false;
	}

	if (nvmet_rdma_need_data_in(rsp)) {
		if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, &rsp->read_cqe, NULL))
			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
	} else {
		rsp->req.execute(&rsp->req);
	}

	return true;
}

static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
		struct nvmet_rdma_rsp *cmd)
{
	u16 status;

	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
		DMA_FROM_DEVICE);
	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->send_sge.addr, cmd->send_sge.length,
		DMA_TO_DEVICE);

	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_rdma_ops))
		return;

	status = nvmet_rdma_map_sgl(cmd);
	if (status)
		goto out_err;

	if (unlikely(!nvmet_rdma_execute_command(cmd))) {
		spin_lock(&queue->rsp_wr_wait_lock);
		list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
		spin_unlock(&queue->rsp_wr_wait_lock);
	}

	return;

out_err:
	nvmet_req_complete(&cmd->req, status);
}

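/*
 * RECV completion: a command capsule arrived.  Grab a response context,
 * then either handle the command right away or, while the queue is still
 * connecting, park it on rsp_wait_list until the connection goes live.
 */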
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_cmd *cmd =
		container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;
	struct nvmet_rdma_rsp *rsp;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status),
				wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
		pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
		nvmet_rdma_error_comp(queue);
		return;
	}

	cmd->queue = queue;
	rsp = nvmet_rdma_get_rsp(queue);
	if (unlikely(!rsp)) {
		/*
		 * we get here only under memory pressure,
		 * silently drop and have the host retry
		 * as we can't even fail it.
		 */
		nvmet_rdma_post_recv(queue->dev, cmd);
		return;
	}
	rsp->queue = queue;
	rsp->cmd = cmd;
	rsp->flags = 0;
	rsp->req.cmd = cmd->nvme_cmd;
	rsp->req.port = queue->port;
	rsp->n_rdma = 0;

	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
		unsigned long flags;

		spin_lock_irqsave(&queue->state_lock, flags);
		if (queue->state == NVMET_RDMA_Q_CONNECTING)
			list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
		else
			nvmet_rdma_put_rsp(rsp);
		spin_unlock_irqrestore(&queue->state_lock, flags);
		return;
	}

	nvmet_rdma_handle_command(queue, rsp);
}

static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
	if (!ndev->srq)
		return;

	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
	ib_destroy_srq(ndev->srq);
}

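/*
 * With the use_srq module parameter set, all queues on a device share a
 * single receive queue and its command slots.  If the HCA cannot create
 * an SRQ we quietly fall back to per-queue receive queues.
 */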
static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
	struct ib_srq_init_attr srq_attr = { NULL, };
	struct ib_srq *srq;
	size_t srq_size;
	int ret, i;

	srq_size = 4095;	/* XXX: tune */

	srq_attr.attr.max_wr = srq_size;
	srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
	srq_attr.attr.srq_limit = 0;
	srq_attr.srq_type = IB_SRQT_BASIC;
	srq = ib_create_srq(ndev->pd, &srq_attr);
	if (IS_ERR(srq)) {
		/*
		 * If SRQs aren't supported we just go ahead and use normal
		 * non-shared receive queues.
		 */
		pr_info("SRQ requested but not supported.\n");
		return 0;
	}

	ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
	if (IS_ERR(ndev->srq_cmds)) {
		ret = PTR_ERR(ndev->srq_cmds);
		goto out_destroy_srq;
	}

	ndev->srq = srq;
	ndev->srq_size = srq_size;

	for (i = 0; i < srq_size; i++) {
		ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
		if (ret)
			goto out_free_cmds;
	}

	return 0;

out_free_cmds:
	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
out_destroy_srq:
	ib_destroy_srq(srq);
	return ret;
}

static void nvmet_rdma_free_dev(struct kref *ref)
{
	struct nvmet_rdma_device *ndev =
		container_of(ref, struct nvmet_rdma_device, ref);

	mutex_lock(&device_list_mutex);
	list_del(&ndev->entry);
	mutex_unlock(&device_list_mutex);

	nvmet_rdma_destroy_srq(ndev);
	ib_dealloc_pd(ndev->pd);

	kfree(ndev);
}

static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
	struct nvmet_port *port = cm_id->context;
	struct nvmet_rdma_device *ndev;
	int inline_page_count;
	int inline_sge_count;
	int ret;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device->node_guid == cm_id->device->node_guid &&
		    kref_get_unless_zero(&ndev->ref))
			goto out_unlock;
	}

	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
	if (!ndev)
		goto out_err;

	inline_page_count = num_pages(port->inline_data_size);
	inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
				cm_id->device->attrs.max_recv_sge) - 1;
	if (inline_page_count > inline_sge_count) {
		pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
			port->inline_data_size, cm_id->device->name,
			inline_sge_count * PAGE_SIZE);
		port->inline_data_size = inline_sge_count * PAGE_SIZE;
		inline_page_count = inline_sge_count;
	}
	ndev->inline_data_size = port->inline_data_size;
	ndev->inline_page_count = inline_page_count;
	ndev->device = cm_id->device;
	kref_init(&ndev->ref);

	ndev->pd = ib_alloc_pd(ndev->device, 0);
	if (IS_ERR(ndev->pd))
		goto out_free_dev;

	if (nvmet_rdma_use_srq) {
		ret = nvmet_rdma_init_srq(ndev);
		if (ret)
			goto out_free_pd;
	}

	list_add(&ndev->entry, &device_list);
out_unlock:
	mutex_unlock(&device_list_mutex);
	pr_debug("added %s.\n", ndev->device->name);
	return ndev;

out_free_pd:
	ib_dealloc_pd(ndev->pd);
out_free_dev:
	kfree(ndev);
out_err:
	mutex_unlock(&device_list_mutex);
	return NULL;
}

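/*
 * Queue pair sizing notes: sq_wr_avail tracks the send work requests
 * still available (one SEND per response plus the RDMA READ/WRITE WRs
 * for its data), and rdma_rw_mr_factor() is used to size max_rdma_ctxs
 * so that send_queue_size maximum-sized (2^MDTS pages) transfers can be
 * in flight at once.
 */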
static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp_init_attr qp_attr;
	struct nvmet_rdma_device *ndev = queue->dev;
	int comp_vector, nr_cqe, ret, i, factor;

	/*
	 * Spread the io queues across completion vectors,
	 * but still keep all admin queues on vector 0.
	 */
	comp_vector = !queue->host_qid ? 0 :
		queue->idx % ndev->device->num_comp_vectors;

	/*
	 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
	 */
	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

	queue->cq = ib_alloc_cq(ndev->device, queue,
			nr_cqe + 1, comp_vector,
			IB_POLL_WORKQUEUE);
	if (IS_ERR(queue->cq)) {
		ret = PTR_ERR(queue->cq);
		pr_err("failed to create CQ cqe= %d ret= %d\n",
		       nr_cqe + 1, ret);
		goto out;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_context = queue;
	qp_attr.event_handler = nvmet_rdma_qp_event;
	qp_attr.send_cq = queue->cq;
	qp_attr.recv_cq = queue->cq;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	/* +1 for drain */
	qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
	factor = rdma_rw_mr_factor(ndev->device, queue->cm_id->port_num,
				   1 << NVMET_RDMA_MAX_MDTS);
	qp_attr.cap.max_rdma_ctxs = queue->send_queue_size * factor;
	qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
					ndev->device->attrs.max_send_sge);

	if (ndev->srq) {
		qp_attr.srq = ndev->srq;
	} else {
		/* +1 for drain */
		qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
		qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
	}

	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
	if (ret) {
		pr_err("failed to create_qp ret= %d\n", ret);
		goto err_destroy_cq;
	}

	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
		 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
		 qp_attr.cap.max_send_wr, queue->cm_id);

	if (!ndev->srq) {
		for (i = 0; i < queue->recv_queue_size; i++) {
			queue->cmds[i].queue = queue;
			ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
			if (ret)
				goto err_destroy_qp;
		}
	}

out:
	return ret;

err_destroy_qp:
	rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
	ib_free_cq(queue->cq);
	goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp *qp = queue->cm_id->qp;

	ib_drain_qp(qp);
	rdma_destroy_id(queue->cm_id);
	ib_destroy_qp(qp);
	ib_free_cq(queue->cq);
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
	pr_debug("freeing queue %d\n", queue->idx);

	nvmet_sq_destroy(&queue->nvme_sq);

	nvmet_rdma_destroy_queue_ib(queue);
	if (!queue->dev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
	nvmet_rdma_free_rsps(queue);
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
	kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
	struct nvmet_rdma_queue *queue =
		container_of(w, struct nvmet_rdma_queue, release_work);
	struct nvmet_rdma_device *dev = queue->dev;

	nvmet_rdma_free_queue(queue);

	kref_put(&dev->ref, nvmet_rdma_free_dev);
}

static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
				struct nvmet_rdma_queue *queue)
{
	struct nvme_rdma_cm_req *req;

	req = (struct nvme_rdma_cm_req *)conn->private_data;
	if (!req || conn->private_data_len == 0)
		return NVME_RDMA_CM_INVALID_LEN;

	if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
		return NVME_RDMA_CM_INVALID_RECFMT;

	queue->host_qid = le16_to_cpu(req->qid);

	/*
	 * req->hsqsize is a 0's based value, so our recv queue size is
	 * hsqsize + 1; req->hrqsize corresponds to our send queue size.
	 */
	queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
	queue->send_queue_size = le16_to_cpu(req->hrqsize);

	if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
		return NVME_RDMA_CM_INVALID_HSQSIZE;

	/* XXX: Should we enforce some kind of max for IO queues? */

	return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
				enum nvme_rdma_cm_status status)
{
	struct nvme_rdma_cm_rej rej;

	pr_debug("rejecting connect request: status %d (%s)\n",
		 status, nvme_rdma_cm_msg(status));

	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	rej.sts = cpu_to_le16(status);

	return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}

static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
		struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_reject;
	}

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_queue;
	}

	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
	if (ret)
		goto out_destroy_sq;

	/*
	 * Schedules the actual release because calling rdma_destroy_id from
	 * inside a CM callback would trigger a deadlock. (great API design..)
	 */
	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
	queue->dev = ndev;
	queue->cm_id = cm_id;

	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_RDMA_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->rsp_wait_list);
	INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
	spin_lock_init(&queue->rsp_wr_wait_lock);
	INIT_LIST_HEAD(&queue->free_rsps);
	spin_lock_init(&queue->rsps_lock);
	INIT_LIST_HEAD(&queue->queue_list);

	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_destroy_sq;
	}

	ret = nvmet_rdma_alloc_rsps(queue);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_ida_remove;
	}

	if (!ndev->srq) {
		queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				queue->recv_queue_size,
				!queue->host_qid);
		if (IS_ERR(queue->cmds)) {
			ret = NVME_RDMA_CM_NO_RSC;
			goto out_free_responses;
		}
	}

	ret = nvmet_rdma_create_queue_ib(queue);
	if (ret) {
		pr_err("%s: creating RDMA queue failed (%d).\n",
			__func__, ret);
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_cmds;
	}

	return queue;

out_free_cmds:
	if (!ndev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
out_free_responses:
	nvmet_rdma_free_rsps(queue);
out_ida_remove:
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
	kfree(queue);
out_reject:
	nvmet_rdma_cm_reject(cm_id, ret);
	return NULL;
}

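/*
 * IB_EVENT_COMM_EST fires when data arrives before the RDMA CM has
 * delivered its ESTABLISHED event; rdma_notify() lets the CM catch up
 * and complete the connection transition.
 */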
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
	struct nvmet_rdma_queue *queue = priv;

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(queue->cm_id, event->event);
		break;
	default:
		pr_err("received IB QP event: %s (%d)\n",
		       ib_event_msg(event->event), event->event);
		break;
	}
}

static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue,
		struct rdma_conn_param *p)
{
	struct rdma_conn_param param = { };
	struct nvme_rdma_cm_rep priv = { };
	int ret = -ENOMEM;

	param.rnr_retry_count = 7;
	param.flow_control = 1;
	param.initiator_depth = min_t(u8, p->initiator_depth,
		queue->dev->device->attrs.max_qp_init_rd_atom);
	param.private_data = &priv;
	param.private_data_len = sizeof(priv);
	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	priv.crqsize = cpu_to_le16(queue->recv_queue_size);

	ret = rdma_accept(cm_id, &param);
	if (ret)
		pr_err("rdma_accept failed (error code = %d)\n", ret);

	return ret;
}

static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_device *ndev;
	struct nvmet_rdma_queue *queue;
	int ret = -EINVAL;

	ndev = nvmet_rdma_find_get_device(cm_id);
	if (!ndev) {
		nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
		return -ECONNREFUSED;
	}

	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
	if (!queue) {
		ret = -ENOMEM;
		goto put_device;
	}
	queue->port = cm_id->context;

	if (queue->host_qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
	if (ret) {
		schedule_work(&queue->release_work);
		/* Destroying rdma_cm id is not needed here */
		return 0;
	}

	mutex_lock(&nvmet_rdma_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	return 0;

put_device:
	kref_put(&ndev->ref, nvmet_rdma_free_dev);

	return ret;
}

static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
	unsigned long flags;

	spin_lock_irqsave(&queue->state_lock, flags);
	if (queue->state != NVMET_RDMA_Q_CONNECTING) {
		pr_warn("trying to establish a connected queue\n");
		goto out_unlock;
	}
	queue->state = NVMET_RDMA_Q_LIVE;

	while (!list_empty(&queue->rsp_wait_list)) {
		struct nvmet_rdma_rsp *cmd;

		cmd = list_first_entry(&queue->rsp_wait_list,
					struct nvmet_rdma_rsp, wait_list);
		list_del(&cmd->wait_list);

		spin_unlock_irqrestore(&queue->state_lock, flags);
		nvmet_rdma_handle_command(queue, cmd);
		spin_lock_irqsave(&queue->state_lock, flags);
	}

out_unlock:
	spin_unlock_irqrestore(&queue->state_lock, flags);
}

static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;
	unsigned long flags;

	pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

	spin_lock_irqsave(&queue->state_lock, flags);
	switch (queue->state) {
	case NVMET_RDMA_Q_CONNECTING:
	case NVMET_RDMA_Q_LIVE:
		queue->state = NVMET_RDMA_Q_DISCONNECTING;
		disconnect = true;
		break;
	case NVMET_RDMA_Q_DISCONNECTING:
		break;
	}
	spin_unlock_irqrestore(&queue->state_lock, flags);

	if (disconnect) {
		rdma_disconnect(queue->cm_id);
		schedule_work(&queue->release_work);
	}
}

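/*
 * Removing the queue from nvmet_rdma_queue_list under the mutex makes
 * sure only one of the possible teardown paths (CM disconnect event,
 * controller delete, device removal) actually performs the disconnect.
 */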
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list)) {
		list_del_init(&queue->queue_list);
		disconnect = true;
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	if (disconnect)
		__nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list))
		list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	pr_err("failed to connect queue %d\n", queue->idx);
	schedule_work(&queue->release_work);
}

/**
 * nvmet_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (where in this
 * case queue will be null).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroying
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	struct nvmet_port *port;

	if (queue) {
		/*
		 * This is a queue cm_id. We have registered
		 * an ib_client to handle queue removal,
		 * so don't interfere and just return.
		 */
		return 0;
	}

	port = cm_id->context;

	/*
	 * This is a listener cm_id. Make sure that
	 * future remove_port won't invoke a double
	 * cm_id destroy. use atomic xchg to make sure
	 * we don't compete with remove_port.
	 */
	if (xchg(&port->priv, NULL) != cm_id)
		return 0;

	/*
	 * We need to return 1 so that the core will destroy
	 * its own ID. What a great API design..
	 */
	return 1;
}

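/*
 * Central RDMA CM event dispatcher for both the listener cm_id and the
 * per-queue cm_ids; 'queue' is only non-NULL once a QP exists, so
 * listener-only events see queue == NULL.
 */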
static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue = NULL;
	int ret = 0;

	if (cm_id->qp)
		queue = cm_id->qp->qp_context;

	pr_debug("%s (%d): status %d id %p\n",
		rdma_event_msg(event->event), event->event,
		event->status, cm_id);

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = nvmet_rdma_queue_connect(cm_id, event);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		nvmet_rdma_queue_established(queue);
		break;
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		nvmet_rdma_queue_disconnect(queue);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		ret = nvmet_rdma_device_removal(cm_id, queue);
		break;
	case RDMA_CM_EVENT_REJECTED:
		pr_debug("Connection rejected: %s\n",
			 rdma_reject_msg(cm_id, event->status));
		/* FALLTHROUGH */
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
		nvmet_rdma_queue_connect_fail(cm_id, queue);
		break;
	default:
		pr_err("received unrecognized RDMA CM event %d\n",
			event->event);
		break;
	}

	return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_rdma_queue *queue;

restart:
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
		if (queue->nvme_sq.ctrl == ctrl) {
			list_del_init(&queue->queue_list);
			mutex_unlock(&nvmet_rdma_queue_mutex);

			__nvmet_rdma_queue_disconnect(queue);
			goto restart;
		}
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);
}

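/*
 * add_port binds a listening cm_id to the configured traddr/trsvcid and
 * stashes it in port->priv; remove_port and DEVICE_REMOVAL later race
 * for it with xchg() so the cm_id is destroyed exactly once.
 */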
static int nvmet_rdma_add_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_storage addr = { };
	__kernel_sa_family_t af;
	int ret;

	switch (port->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				port->disc_addr.adrfam);
		return -EINVAL;
	}

	if (port->inline_data_size < 0) {
		port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
	} else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
		pr_warn("inline_data_size %u is too large, reducing to %u\n",
			port->inline_data_size,
			NVMET_RDMA_MAX_INLINE_DATA_SIZE);
		port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
	}

	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
			port->disc_addr.trsvcid, &addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			port->disc_addr.traddr, port->disc_addr.trsvcid);
		return ret;
	}

	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("CM ID creation failed\n");
		return PTR_ERR(cm_id);
	}

	/*
	 * Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
	ret = rdma_set_afonly(cm_id, 1);
	if (ret) {
		pr_err("rdma_set_afonly failed (%d)\n", ret);
		goto out_destroy_id;
	}

	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
	if (ret) {
		pr_err("binding CM ID to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	ret = rdma_listen(cm_id, 128);
	if (ret) {
		pr_err("listening to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	pr_info("enabling port %d (%pISpcs)\n",
		le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
	port->priv = cm_id;
	return 0;

out_destroy_id:
	rdma_destroy_id(cm_id);
	return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);

	if (cm_id)
		rdma_destroy_id(cm_id);
}

static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *port, char *traddr)
{
	struct rdma_cm_id *cm_id = port->priv;

	if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
		struct nvmet_rdma_rsp *rsp =
			container_of(req, struct nvmet_rdma_rsp, req);
		struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
		struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;

		sprintf(traddr, "%pISc", addr);
	} else {
		memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}

static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl)
{
	return NVMET_RDMA_MAX_MDTS;
}

static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_RDMA,
	.msdbd			= 1,
	.has_keyed_sgls		= 1,
	.add_port		= nvmet_rdma_add_port,
	.remove_port		= nvmet_rdma_remove_port,
	.queue_response		= nvmet_rdma_queue_response,
	.delete_ctrl		= nvmet_rdma_delete_ctrl,
	.disc_traddr		= nvmet_rdma_disc_port_addr,
	.get_mdts		= nvmet_rdma_get_mdts,
};

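/*
 * ib_client ->remove callback: the ib_device is going away, so disconnect
 * every queue that was created on it and wait for the queued release work
 * to finish before the device is finally unplugged.
 */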
static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
	struct nvmet_rdma_queue *queue, *tmp;
	struct nvmet_rdma_device *ndev;
	bool found = false;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device == ib_device) {
			found = true;
			break;
		}
	}
	mutex_unlock(&device_list_mutex);

	if (!found)
		return;

	/*
	 * IB Device that is used by nvmet controllers is being removed,
	 * delete all queues using this device.
	 */
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
				 queue_list) {
		if (queue->dev->device != ib_device)
			continue;

		pr_info("Removing queue %d\n", queue->idx);
		list_del_init(&queue->queue_list);
		__nvmet_rdma_queue_disconnect(queue);
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
	.name	= "nvmet_rdma",
	.remove = nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
	int ret;

	ret = ib_register_client(&nvmet_rdma_ib_client);
	if (ret)
		return ret;

	ret = nvmet_register_transport(&nvmet_rdma_ops);
	if (ret)
		goto err_ib_client;

	return 0;

err_ib_client:
	ib_unregister_client(&nvmet_rdma_ib_client);
	return ret;
}

static void __exit nvmet_rdma_exit(void)
{
	nvmet_unregister_transport(&nvmet_rdma_ops);
	ib_unregister_client(&nvmet_rdma_ib_client);
	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
	ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */