// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
 */
#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE	PAGE_SIZE
#define NVMET_RDMA_MAX_INLINE_SGE		4
#define NVMET_RDMA_MAX_INLINE_DATA_SIZE		max_t(int, SZ_16K, PAGE_SIZE)

/* Assume mpsmin == device_page_size == 4KB */
#define NVMET_RDMA_MAX_MDTS			8

struct nvmet_rdma_cmd {
	struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
	struct ib_cqe cqe;
	struct ib_recv_wr wr;
	struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
	struct nvme_command *nvme_cmd;
	struct nvmet_rdma_queue *queue;
};

enum {
	NVMET_RDMA_REQ_INLINE_DATA	= (1 << 0),
	NVMET_RDMA_REQ_INVALIDATE_RKEY	= (1 << 1),
};

struct nvmet_rdma_rsp {
	struct ib_sge send_sge;
	struct ib_cqe send_cqe;
	struct ib_send_wr send_wr;

	struct nvmet_rdma_cmd *cmd;
	struct nvmet_rdma_queue *queue;

	struct ib_cqe read_cqe;
	struct rdma_rw_ctx rw;

	struct nvmet_req req;

	bool allocated;
	u8 n_rdma;
	u32 flags;
	u32 invalidate_rkey;

	struct list_head wait_list;
	struct list_head free_list;
};

enum nvmet_rdma_queue_state {
	NVMET_RDMA_Q_CONNECTING,
	NVMET_RDMA_Q_LIVE,
	NVMET_RDMA_Q_DISCONNECTING,
};

struct nvmet_rdma_queue {
	struct rdma_cm_id *cm_id;
	struct nvmet_port *port;
	struct ib_cq *cq;
	atomic_t sq_wr_avail;
	struct nvmet_rdma_device *dev;
	spinlock_t state_lock;
	enum nvmet_rdma_queue_state state;
	struct nvmet_cq nvme_cq;
	struct nvmet_sq nvme_sq;

	struct nvmet_rdma_rsp *rsps;
	struct list_head free_rsps;
	spinlock_t rsps_lock;
	struct nvmet_rdma_cmd *cmds;

	struct work_struct release_work;
	struct list_head rsp_wait_list;
	struct list_head rsp_wr_wait_list;
	spinlock_t rsp_wr_wait_lock;

	int idx;
	int host_qid;
	int recv_queue_size;
	int send_queue_size;

	struct list_head queue_list;
};

struct nvmet_rdma_device {
	struct ib_device *device;
	struct ib_pd *pd;
	struct ib_srq *srq;
	struct nvmet_rdma_cmd *srq_cmds;
	size_t srq_size;
	struct kref ref;
	struct list_head entry;
	int inline_data_size;
	int inline_page_count;
};

static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);

static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);
static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r);
static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;

static int num_pages(int len)
{
	return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
}

/* XXX: really should move to a generic header sooner or later.. */
static inline u32 get_unaligned_le24(const u8 *p)
{
	return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
}

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
	return nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
	return !nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!rsp->req.cqe->status &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_rsp *rsp;
	unsigned long flags;

	spin_lock_irqsave(&queue->rsps_lock, flags);
	rsp = list_first_entry_or_null(&queue->free_rsps,
			struct nvmet_rdma_rsp, free_list);
	if (likely(rsp))
		list_del(&rsp->free_list);
	spin_unlock_irqrestore(&queue->rsps_lock, flags);

	if (unlikely(!rsp)) {
		int ret;

		rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
		if (unlikely(!rsp))
			return NULL;
		ret = nvmet_rdma_alloc_rsp(queue->dev, rsp);
		if (unlikely(ret)) {
			kfree(rsp);
			return NULL;
		}

		rsp->allocated = true;
	}

	return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
	unsigned long flags;

	if (unlikely(rsp->allocated)) {
		nvmet_rdma_free_rsp(rsp->queue->dev, rsp);
		kfree(rsp);
		return;
	}

	spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
	list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}

static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	int i;

	if (!ndev->inline_data_size)
		return;

	sg = c->inline_sg;
	sge = &c->sge[1];

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
}

static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	struct page *pg;
	int len;
	int i;

	if (!ndev->inline_data_size)
		return 0;

	sg = c->inline_sg;
	sg_init_table(sg, ndev->inline_page_count);
	sge = &c->sge[1];
	len = ndev->inline_data_size;

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		pg = alloc_page(GFP_KERNEL);
		if (!pg)
			goto out_err;
		sg_assign_page(sg, pg);
		sge->addr = ib_dma_map_page(ndev->device,
			pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ndev->device, sge->addr))
			goto out_err;
		sge->length = min_t(int, len, PAGE_SIZE);
		sge->lkey = ndev->pd->local_dma_lkey;
		len -= sge->length;
	}

	return 0;
out_err:
	for (; i >= 0; i--, sg--, sge--) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
	return -ENOMEM;
}

static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	/* NVMe command / RDMA RECV */
	c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
	if (!c->nvme_cmd)
		goto out;

	c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
		goto out_free_cmd;

	c->sge[0].length = sizeof(*c->nvme_cmd);
	c->sge[0].lkey = ndev->pd->local_dma_lkey;

	if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
		goto out_unmap_cmd;

	c->cqe.done = nvmet_rdma_recv_done;

	c->wr.wr_cqe = &c->cqe;
	c->wr.sg_list = c->sge;
	c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;

	return 0;

out_unmap_cmd:
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
	kfree(c->nvme_cmd);

out:
	return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	if (!admin)
		nvmet_rdma_free_inline_pages(ndev, c);
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
		int nr_cmds, bool admin)
{
	struct nvmet_rdma_cmd *cmds;
	int ret = -EINVAL, i;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
		if (ret)
			goto out_free;
	}

	return cmds;

out_free:
	while (--i >= 0)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
out:
	return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
	int i;

	for (i = 0; i < nr_cmds; i++)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
}

static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	/* NVMe CQE / RDMA SEND */
	r->req.cqe = kmalloc(sizeof(*r->req.cqe), GFP_KERNEL);
	if (!r->req.cqe)
		goto out;

	r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.cqe,
			sizeof(*r->req.cqe), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
		goto out_free_rsp;

	r->req.p2p_client = &ndev->device->dev;
	r->send_sge.length = sizeof(*r->req.cqe);
	r->send_sge.lkey = ndev->pd->local_dma_lkey;

	r->send_cqe.done = nvmet_rdma_send_done;

	r->send_wr.wr_cqe = &r->send_cqe;
	r->send_wr.sg_list = &r->send_sge;
	r->send_wr.num_sge = 1;
	r->send_wr.send_flags = IB_SEND_SIGNALED;

	/* Data In / RDMA READ */
	r->read_cqe.done = nvmet_rdma_read_data_done;
	return 0;

out_free_rsp:
	kfree(r->req.cqe);
out:
	return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	ib_dma_unmap_single(ndev->device, r->send_sge.addr,
			sizeof(*r->req.cqe), DMA_TO_DEVICE);
	kfree(r->req.cqe);
}

static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int nr_rsps = queue->recv_queue_size * 2;
	int ret = -EINVAL, i;

	queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
			GFP_KERNEL);
	if (!queue->rsps)
		goto out;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		ret = nvmet_rdma_alloc_rsp(ndev, rsp);
		if (ret)
			goto out_free;

		list_add_tail(&rsp->free_list, &queue->free_rsps);
	}

	return 0;

out_free:
	while (--i >= 0) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
out:
	return ret;
}

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int i, nr_rsps = queue->recv_queue_size * 2;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
}

static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmd)
{
	int ret;

	ib_dma_sync_single_for_device(ndev->device,
		cmd->sge[0].addr, cmd->sge[0].length,
		DMA_FROM_DEVICE);

	if (ndev->srq)
		ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
	else
		ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);

	if (unlikely(ret))
		pr_err("post_recv cmd failed\n");

	return ret;
}

static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
	spin_lock(&queue->rsp_wr_wait_lock);
	while (!list_empty(&queue->rsp_wr_wait_list)) {
		struct nvmet_rdma_rsp *rsp;
		bool ret;

		rsp = list_entry(queue->rsp_wr_wait_list.next,
				struct nvmet_rdma_rsp, wait_list);
		list_del(&rsp->wait_list);

		spin_unlock(&queue->rsp_wr_wait_lock);
		ret = nvmet_rdma_execute_command(rsp);
		spin_lock(&queue->rsp_wr_wait_lock);

		if (!ret) {
			list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
			break;
		}
	}
	spin_unlock(&queue->rsp_wr_wait_lock);
}


static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

	if (rsp->n_rdma) {
		rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, rsp->req.sg,
				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	}

	if (rsp->req.sg != rsp->cmd->inline_sg)
		nvmet_req_free_sgl(&rsp->req);

	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
		nvmet_rdma_process_wr_wait_list(queue);

	nvmet_rdma_put_rsp(rsp);
}

static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
	if (queue->nvme_sq.ctrl) {
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	} else {
		/*
		 * we didn't setup the controller yet in case
		 * of admin connect error, just disconnect and
		 * cleanup the queue
		 */
		nvmet_rdma_queue_disconnect(queue);
	}
}

static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	nvmet_rdma_release_rsp(rsp);

	if (unlikely(wc->status != IB_WC_SUCCESS &&
		     wc->status != IB_WC_WR_FLUSH_ERR)) {
		pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
			wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
		nvmet_rdma_error_comp(queue);
	}
}

static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(req, struct nvmet_rdma_rsp, req);
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	struct ib_send_wr *first_wr;

	if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
		rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
		rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
	} else {
		rsp->send_wr.opcode = IB_WR_SEND;
	}

	if (nvmet_rdma_need_data_out(rsp))
		first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
				cm_id->port_num, NULL, &rsp->send_wr);
	else
		first_wr = &rsp->send_wr;

	nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

	ib_dma_sync_single_for_device(rsp->queue->dev->device,
		rsp->send_sge.addr, rsp->send_sge.length,
		DMA_TO_DEVICE);

	if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
		pr_err("sending cmd response failed\n");
		nvmet_rdma_release_rsp(rsp);
	}
}

static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	WARN_ON(rsp->n_rdma <= 0);
	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
			queue->cm_id->port_num, rsp->req.sg,
			rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	rsp->n_rdma = 0;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		nvmet_req_uninit(&rsp->req);
		nvmet_rdma_release_rsp(rsp);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	rsp->req.execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
		u64 off)
{
	int sg_count = num_pages(len);
	struct scatterlist *sg;
	int i;

	sg = rsp->cmd->inline_sg;
	for (i = 0; i < sg_count; i++, sg++) {
		if (i < sg_count - 1)
			sg_unmark_end(sg);
		else
			sg_mark_end(sg);
		sg->offset = off;
		sg->length = min_t(int, len, PAGE_SIZE - off);
		len -= sg->length;
		if (!i)
			off = 0;
	}

	rsp->req.sg = rsp->cmd->inline_sg;
	rsp->req.sg_cnt = sg_count;
}

static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
	u64 off = le64_to_cpu(sgl->addr);
	u32 len = le32_to_cpu(sgl->length);

	if (!nvme_is_write(rsp->req.cmd)) {
		rsp->req.error_loc =
			offsetof(struct nvme_common_command, opcode);
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
	}

	if (off + len > rsp->queue->dev->inline_data_size) {
		pr_err("invalid inline data offset!\n");
		return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
	}

	/* no data command? */
	if (!len)
		return 0;

	nvmet_rdma_use_inline_sg(rsp, len, off);
	rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
	rsp->req.transfer_len += len;
	return 0;
}

static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
		struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	u64 addr = le64_to_cpu(sgl->addr);
	u32 key = get_unaligned_le32(sgl->key);
	int ret;

	rsp->req.transfer_len = get_unaligned_le24(sgl->length);

	/* no data command? */
	if (!rsp->req.transfer_len)
		return 0;

	ret = nvmet_req_alloc_sgl(&rsp->req);
	if (unlikely(ret < 0))
		goto error_out;

	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
			nvmet_data_dir(&rsp->req));
	if (unlikely(ret < 0))
		goto error_out;
	rsp->n_rdma += ret;

	if (invalidate) {
		rsp->invalidate_rkey = key;
		rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
	}

	return 0;

error_out:
	rsp->req.transfer_len = 0;
	return NVME_SC_INTERNAL;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

	switch (sgl->type >> 4) {
	case NVME_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_OFFSET:
			return nvmet_rdma_map_sgl_inline(rsp);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			rsp->req.error_loc =
				offsetof(struct nvme_common_command, dptr);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	case NVME_KEY_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
		case NVME_SGL_FMT_ADDRESS:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			rsp->req.error_loc =
				offsetof(struct nvme_common_command, dptr);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	default:
		pr_err("invalid SGL type: %#x\n", sgl->type);
		rsp->req.error_loc = offsetof(struct nvme_common_command, dptr);
		return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
	}
}

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
			&queue->sq_wr_avail) < 0)) {
		pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
				1 + rsp->n_rdma, queue->idx,
				queue->nvme_sq.ctrl->cntlid);
		atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
		return false;
	}

	if (nvmet_rdma_need_data_in(rsp)) {
		if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, &rsp->read_cqe, NULL))
			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
	} else {
		rsp->req.execute(&rsp->req);
	}

	return true;
}

static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
		struct nvmet_rdma_rsp *cmd)
{
	u16 status;

	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
		DMA_FROM_DEVICE);
	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->send_sge.addr, cmd->send_sge.length,
		DMA_TO_DEVICE);

	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_rdma_ops))
		return;

	status = nvmet_rdma_map_sgl(cmd);
	if (status)
		goto out_err;

	if (unlikely(!nvmet_rdma_execute_command(cmd))) {
		spin_lock(&queue->rsp_wr_wait_lock);
		list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
		spin_unlock(&queue->rsp_wr_wait_lock);
	}

	return;

out_err:
	nvmet_req_complete(&cmd->req, status);
}

static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_cmd *cmd =
		container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;
	struct nvmet_rdma_rsp *rsp;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status),
				wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
		pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
		nvmet_rdma_error_comp(queue);
		return;
	}

	cmd->queue = queue;
	rsp = nvmet_rdma_get_rsp(queue);
	if (unlikely(!rsp)) {
		/*
		 * we get here only under memory pressure,
		 * silently drop and have the host retry
		 * as we can't even fail it.
		 */
		nvmet_rdma_post_recv(queue->dev, cmd);
		return;
	}
	rsp->queue = queue;
	rsp->cmd = cmd;
	rsp->flags = 0;
	rsp->req.cmd = cmd->nvme_cmd;
	rsp->req.port = queue->port;
	rsp->n_rdma = 0;

	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
		unsigned long flags;

		spin_lock_irqsave(&queue->state_lock, flags);
		if (queue->state == NVMET_RDMA_Q_CONNECTING)
			list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
		else
			nvmet_rdma_put_rsp(rsp);
		spin_unlock_irqrestore(&queue->state_lock, flags);
		return;
	}

	nvmet_rdma_handle_command(queue, rsp);
}

static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
	if (!ndev->srq)
		return;

	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
	ib_destroy_srq(ndev->srq);
}

static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
	struct ib_srq_init_attr srq_attr = { NULL, };
	struct ib_srq *srq;
	size_t srq_size;
	int ret, i;

	srq_size = 4095;	/* XXX: tune */

	srq_attr.attr.max_wr = srq_size;
	srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
	srq_attr.attr.srq_limit = 0;
	srq_attr.srq_type = IB_SRQT_BASIC;
	srq = ib_create_srq(ndev->pd, &srq_attr);
	if (IS_ERR(srq)) {
		/*
		 * If SRQs aren't supported we just go ahead and use normal
		 * non-shared receive queues.
		 */
		pr_info("SRQ requested but not supported.\n");
		return 0;
	}

	ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
	if (IS_ERR(ndev->srq_cmds)) {
		ret = PTR_ERR(ndev->srq_cmds);
		goto out_destroy_srq;
	}

	ndev->srq = srq;
	ndev->srq_size = srq_size;

	for (i = 0; i < srq_size; i++) {
		ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
		if (ret)
			goto out_free_cmds;
	}

	return 0;

out_free_cmds:
	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
out_destroy_srq:
	ib_destroy_srq(srq);
	return ret;
}

static void nvmet_rdma_free_dev(struct kref *ref)
{
	struct nvmet_rdma_device *ndev =
		container_of(ref, struct nvmet_rdma_device, ref);

	mutex_lock(&device_list_mutex);
	list_del(&ndev->entry);
	mutex_unlock(&device_list_mutex);

	nvmet_rdma_destroy_srq(ndev);
	ib_dealloc_pd(ndev->pd);

	kfree(ndev);
}

static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
	struct nvmet_port *port = cm_id->context;
	struct nvmet_rdma_device *ndev;
	int inline_page_count;
	int inline_sge_count;
	int ret;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device->node_guid == cm_id->device->node_guid &&
		    kref_get_unless_zero(&ndev->ref))
			goto out_unlock;
	}

	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
	if (!ndev)
		goto out_err;

	inline_page_count = num_pages(port->inline_data_size);
	inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
				cm_id->device->attrs.max_recv_sge) - 1;
	if (inline_page_count > inline_sge_count) {
		pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
			port->inline_data_size, cm_id->device->name,
			inline_sge_count * PAGE_SIZE);
		port->inline_data_size = inline_sge_count * PAGE_SIZE;
		inline_page_count = inline_sge_count;
	}
	ndev->inline_data_size = port->inline_data_size;
	ndev->inline_page_count = inline_page_count;
	ndev->device = cm_id->device;
	kref_init(&ndev->ref);

	ndev->pd = ib_alloc_pd(ndev->device, 0);
	if (IS_ERR(ndev->pd))
		goto out_free_dev;

	if (nvmet_rdma_use_srq) {
		ret = nvmet_rdma_init_srq(ndev);
		if (ret)
			goto out_free_pd;
	}

	list_add(&ndev->entry, &device_list);
out_unlock:
	mutex_unlock(&device_list_mutex);
	pr_debug("added %s.\n", ndev->device->name);
	return ndev;

out_free_pd:
	ib_dealloc_pd(ndev->pd);
out_free_dev:
	kfree(ndev);
out_err:
	mutex_unlock(&device_list_mutex);
	return NULL;
}

static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp_init_attr qp_attr;
	struct nvmet_rdma_device *ndev = queue->dev;
	int comp_vector, nr_cqe, ret, i, factor;

	/*
	 * Spread the io queues across completion vectors,
	 * but still keep all admin queues on vector 0.
	 */
	comp_vector = !queue->host_qid ? 0 :
		queue->idx % ndev->device->num_comp_vectors;

	/*
	 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
	 */
	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

	queue->cq = ib_alloc_cq(ndev->device, queue,
			nr_cqe + 1, comp_vector,
			IB_POLL_WORKQUEUE);
	if (IS_ERR(queue->cq)) {
		ret = PTR_ERR(queue->cq);
		pr_err("failed to create CQ cqe= %d ret= %d\n",
		       nr_cqe + 1, ret);
		goto out;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_context = queue;
	qp_attr.event_handler = nvmet_rdma_qp_event;
	qp_attr.send_cq = queue->cq;
	qp_attr.recv_cq = queue->cq;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	/* +1 for drain */
	qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
	factor = rdma_rw_mr_factor(ndev->device, queue->cm_id->port_num,
				   1 << NVMET_RDMA_MAX_MDTS);
	qp_attr.cap.max_rdma_ctxs = queue->send_queue_size * factor;
	qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
					ndev->device->attrs.max_send_sge);

	if (ndev->srq) {
		qp_attr.srq = ndev->srq;
	} else {
		/* +1 for drain */
		qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
		qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
	}

	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
	if (ret) {
		pr_err("failed to create_qp ret= %d\n", ret);
		goto err_destroy_cq;
	}

	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
		 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
		 qp_attr.cap.max_send_wr, queue->cm_id);

	if (!ndev->srq) {
		for (i = 0; i < queue->recv_queue_size; i++) {
			queue->cmds[i].queue = queue;
			ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
			if (ret)
				goto err_destroy_qp;
		}
	}

out:
	return ret;

err_destroy_qp:
	rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
	ib_free_cq(queue->cq);
	goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp *qp = queue->cm_id->qp;

	ib_drain_qp(qp);
	rdma_destroy_id(queue->cm_id);
	ib_destroy_qp(qp);
	ib_free_cq(queue->cq);
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
	pr_debug("freeing queue %d\n", queue->idx);

	nvmet_sq_destroy(&queue->nvme_sq);

	nvmet_rdma_destroy_queue_ib(queue);
	if (!queue->dev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
	nvmet_rdma_free_rsps(queue);
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
	kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
	struct nvmet_rdma_queue *queue =
		container_of(w, struct nvmet_rdma_queue, release_work);
	struct nvmet_rdma_device *dev = queue->dev;

	nvmet_rdma_free_queue(queue);

	kref_put(&dev->ref, nvmet_rdma_free_dev);
}

static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
		struct nvmet_rdma_queue *queue)
{
	struct nvme_rdma_cm_req *req;

	req = (struct nvme_rdma_cm_req *)conn->private_data;
	if (!req || conn->private_data_len == 0)
		return NVME_RDMA_CM_INVALID_LEN;

	if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
		return NVME_RDMA_CM_INVALID_RECFMT;

	queue->host_qid = le16_to_cpu(req->qid);

	/*
	 * req->hsqsize corresponds to our recv queue size plus 1
	 * req->hrqsize corresponds to our send queue size
	 */
	queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
	queue->send_queue_size = le16_to_cpu(req->hrqsize);

	if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
		return NVME_RDMA_CM_INVALID_HSQSIZE;

	/* XXX: Should we enforce some kind of max for IO queues? */

	return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
		enum nvme_rdma_cm_status status)
{
	struct nvme_rdma_cm_rej rej;

	pr_debug("rejecting connect request: status %d (%s)\n",
		 status, nvme_rdma_cm_msg(status));

	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	rej.sts = cpu_to_le16(status);

	return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}

static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
		struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_reject;
	}

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_queue;
	}

	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
	if (ret)
		goto out_destroy_sq;

	/*
	 * Schedules the actual release because calling rdma_destroy_id from
	 * inside a CM callback would trigger a deadlock. (great API design..)
	 */
	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
	queue->dev = ndev;
	queue->cm_id = cm_id;

	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_RDMA_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->rsp_wait_list);
	INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
	spin_lock_init(&queue->rsp_wr_wait_lock);
	INIT_LIST_HEAD(&queue->free_rsps);
	spin_lock_init(&queue->rsps_lock);
	INIT_LIST_HEAD(&queue->queue_list);

	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_destroy_sq;
	}

	ret = nvmet_rdma_alloc_rsps(queue);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_ida_remove;
	}

	if (!ndev->srq) {
		queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				queue->recv_queue_size,
				!queue->host_qid);
		if (IS_ERR(queue->cmds)) {
			ret = NVME_RDMA_CM_NO_RSC;
			goto out_free_responses;
		}
	}

	ret = nvmet_rdma_create_queue_ib(queue);
	if (ret) {
		pr_err("%s: creating RDMA queue failed (%d).\n",
			__func__, ret);
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_cmds;
	}

	return queue;

out_free_cmds:
	if (!ndev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
out_free_responses:
	nvmet_rdma_free_rsps(queue);
out_ida_remove:
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
	kfree(queue);
out_reject:
	nvmet_rdma_cm_reject(cm_id, ret);
	return NULL;
}

static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
	struct nvmet_rdma_queue *queue = priv;

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(queue->cm_id, event->event);
		break;
	default:
		pr_err("received IB QP event: %s (%d)\n",
		       ib_event_msg(event->event), event->event);
		break;
	}
}

static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue,
		struct rdma_conn_param *p)
{
	struct rdma_conn_param param = { };
	struct nvme_rdma_cm_rep priv = { };
	int ret = -ENOMEM;

	param.rnr_retry_count = 7;
	param.flow_control = 1;
	param.initiator_depth = min_t(u8, p->initiator_depth,
		queue->dev->device->attrs.max_qp_init_rd_atom);
	param.private_data = &priv;
	param.private_data_len = sizeof(priv);
	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	priv.crqsize = cpu_to_le16(queue->recv_queue_size);

	ret = rdma_accept(cm_id, &param);
	if (ret)
		pr_err("rdma_accept failed (error code = %d)\n", ret);

	return ret;
}

static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_device *ndev;
	struct nvmet_rdma_queue *queue;
	int ret = -EINVAL;

	ndev = nvmet_rdma_find_get_device(cm_id);
	if (!ndev) {
		nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
		return -ECONNREFUSED;
	}

	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
	if (!queue) {
		ret = -ENOMEM;
		goto put_device;
	}
	queue->port = cm_id->context;

	if (queue->host_qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
	if (ret) {
		schedule_work(&queue->release_work);
		/* Destroying rdma_cm id is not needed here */
		return 0;
	}

	mutex_lock(&nvmet_rdma_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	return 0;

put_device:
	kref_put(&ndev->ref, nvmet_rdma_free_dev);

	return ret;
}

static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
	unsigned long flags;

	spin_lock_irqsave(&queue->state_lock, flags);
	if (queue->state != NVMET_RDMA_Q_CONNECTING) {
		pr_warn("trying to establish a connected queue\n");
		goto out_unlock;
	}
	queue->state = NVMET_RDMA_Q_LIVE;

	while (!list_empty(&queue->rsp_wait_list)) {
		struct nvmet_rdma_rsp *cmd;

		cmd = list_first_entry(&queue->rsp_wait_list,
					struct nvmet_rdma_rsp, wait_list);
		list_del(&cmd->wait_list);

		spin_unlock_irqrestore(&queue->state_lock, flags);
		nvmet_rdma_handle_command(queue, cmd);
		spin_lock_irqsave(&queue->state_lock, flags);
	}

out_unlock:
	spin_unlock_irqrestore(&queue->state_lock, flags);
}

static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;
	unsigned long flags;

	pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

	spin_lock_irqsave(&queue->state_lock, flags);
	switch (queue->state) {
	case NVMET_RDMA_Q_CONNECTING:
	case NVMET_RDMA_Q_LIVE:
		queue->state = NVMET_RDMA_Q_DISCONNECTING;
		disconnect = true;
		break;
	case NVMET_RDMA_Q_DISCONNECTING:
		break;
	}
	spin_unlock_irqrestore(&queue->state_lock, flags);

	if (disconnect) {
		rdma_disconnect(queue->cm_id);
		schedule_work(&queue->release_work);
	}
}

static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list)) {
		list_del_init(&queue->queue_list);
		disconnect = true;
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	if (disconnect)
		__nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list))
		list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	pr_err("failed to connect queue %d\n", queue->idx);
	schedule_work(&queue->release_work);
}

/**
 * nvme_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (where in this
 * case queue will be null).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction and destroying
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	struct nvmet_port *port;

	if (queue) {
		/*
		 * This is a queue cm_id. we have registered
		 * an ib_client to handle queues removal
		 * so don't interfere and just return.
		 */
		return 0;
	}

	port = cm_id->context;

	/*
	 * This is a listener cm_id. Make sure that
	 * future remove_port won't invoke a double
	 * cm_id destroy. use atomic xchg to make sure
	 * we don't compete with remove_port.
	 */
	if (xchg(&port->priv, NULL) != cm_id)
		return 0;

	/*
	 * We need to return 1 so that the core will destroy
	 * its own ID. What a great API design..
	 */
	return 1;
}

static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue = NULL;
	int ret = 0;

	if (cm_id->qp)
		queue = cm_id->qp->qp_context;

	pr_debug("%s (%d): status %d id %p\n",
		rdma_event_msg(event->event), event->event,
		event->status, cm_id);

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = nvmet_rdma_queue_connect(cm_id, event);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		nvmet_rdma_queue_established(queue);
		break;
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		nvmet_rdma_queue_disconnect(queue);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		ret = nvmet_rdma_device_removal(cm_id, queue);
		break;
	case RDMA_CM_EVENT_REJECTED:
		pr_debug("Connection rejected: %s\n",
			 rdma_reject_msg(cm_id, event->status));
		/* FALLTHROUGH */
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
		nvmet_rdma_queue_connect_fail(cm_id, queue);
		break;
	default:
		pr_err("received unrecognized RDMA CM event %d\n",
			event->event);
		break;
	}

	return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_rdma_queue *queue;

restart:
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
		if (queue->nvme_sq.ctrl == ctrl) {
			list_del_init(&queue->queue_list);
			mutex_unlock(&nvmet_rdma_queue_mutex);

			__nvmet_rdma_queue_disconnect(queue);
			goto restart;
		}
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);
}

static int nvmet_rdma_add_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_storage addr = { };
	__kernel_sa_family_t af;
	int ret;

	switch (port->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				port->disc_addr.adrfam);
		return -EINVAL;
	}

	if (port->inline_data_size < 0) {
		port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
	} else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
		pr_warn("inline_data_size %u is too large, reducing to %u\n",
			port->inline_data_size,
			NVMET_RDMA_MAX_INLINE_DATA_SIZE);
		port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
	}

	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
			port->disc_addr.trsvcid, &addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			port->disc_addr.traddr, port->disc_addr.trsvcid);
		return ret;
	}

	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("CM ID creation failed\n");
		return PTR_ERR(cm_id);
	}

	/*
	 * Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
	ret = rdma_set_afonly(cm_id, 1);
	if (ret) {
		pr_err("rdma_set_afonly failed (%d)\n", ret);
		goto out_destroy_id;
	}

	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
	if (ret) {
		pr_err("binding CM ID to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	ret = rdma_listen(cm_id, 128);
	if (ret) {
		pr_err("listening to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	pr_info("enabling port %d (%pISpcs)\n",
		le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
	port->priv = cm_id;
	return 0;

out_destroy_id:
	rdma_destroy_id(cm_id);
	return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);

	if (cm_id)
		rdma_destroy_id(cm_id);
}

static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *port, char *traddr)
{
	struct rdma_cm_id *cm_id = port->priv;

	if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
		struct nvmet_rdma_rsp *rsp =
			container_of(req, struct nvmet_rdma_rsp, req);
		struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
		struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;

		sprintf(traddr, "%pISc", addr);
	} else {
		memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}

static u8 nvmet_rdma_get_mdts(const struct nvmet_ctrl *ctrl)
{
	return NVMET_RDMA_MAX_MDTS;
}

static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_RDMA,
	.msdbd			= 1,
	.has_keyed_sgls		= 1,
	.add_port		= nvmet_rdma_add_port,
	.remove_port		= nvmet_rdma_remove_port,
	.queue_response		= nvmet_rdma_queue_response,
	.delete_ctrl		= nvmet_rdma_delete_ctrl,
	.disc_traddr		= nvmet_rdma_disc_port_addr,
	.get_mdts		= nvmet_rdma_get_mdts,
};

static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
	struct nvmet_rdma_queue *queue, *tmp;
	struct nvmet_rdma_device *ndev;
	bool found = false;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device == ib_device) {
			found = true;
			break;
		}
	}
	mutex_unlock(&device_list_mutex);

	if (!found)
		return;

	/*
	 * IB Device that is used by nvmet controllers is being removed,
	 * delete all queues using this device.
	 */
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
				 queue_list) {
		if (queue->dev->device != ib_device)
			continue;

		pr_info("Removing queue %d\n", queue->idx);
		list_del_init(&queue->queue_list);
		__nvmet_rdma_queue_disconnect(queue);
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
	.name	= "nvmet_rdma",
	.remove = nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
	int ret;

	ret = ib_register_client(&nvmet_rdma_ib_client);
	if (ret)
		return ret;

	ret = nvmet_register_transport(&nvmet_rdma_ops);
	if (ret)
		goto err_ib_client;

	return 0;

err_ib_client:
	ib_unregister_client(&nvmet_rdma_ib_client);
	return ret;
}

static void __exit nvmet_rdma_exit(void)
{
	nvmet_unregister_transport(&nvmet_rdma_ops);
	ib_unregister_client(&nvmet_rdma_ib_client);
	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
	ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */