/*
 * NVMe over Fabrics RDMA target.
 * Copyright (c) 2015-2016 HGST, a Western Digital Company.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/atomic.h>
#include <linux/ctype.h>
#include <linux/delay.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/nvme.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/wait.h>
#include <linux/inet.h>
#include <asm/unaligned.h>

#include <rdma/ib_verbs.h>
#include <rdma/rdma_cm.h>
#include <rdma/rw.h>

#include <linux/nvme-rdma.h>
#include "nvmet.h"

/*
 * We allow at least 1 page, up to 4 SGEs, and up to 16KB of inline data
 */
#define NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE	PAGE_SIZE
#define NVMET_RDMA_MAX_INLINE_SGE		4
#define NVMET_RDMA_MAX_INLINE_DATA_SIZE		max_t(int, SZ_16K, PAGE_SIZE)

struct nvmet_rdma_cmd {
	struct ib_sge sge[NVMET_RDMA_MAX_INLINE_SGE + 1];
	struct ib_cqe cqe;
	struct ib_recv_wr wr;
	struct scatterlist inline_sg[NVMET_RDMA_MAX_INLINE_SGE];
	struct nvme_command *nvme_cmd;
	struct nvmet_rdma_queue *queue;
};

enum {
	NVMET_RDMA_REQ_INLINE_DATA	= (1 << 0),
	NVMET_RDMA_REQ_INVALIDATE_RKEY	= (1 << 1),
};

struct nvmet_rdma_rsp {
	struct ib_sge send_sge;
	struct ib_cqe send_cqe;
	struct ib_send_wr send_wr;

	struct nvmet_rdma_cmd *cmd;
	struct nvmet_rdma_queue *queue;

	struct ib_cqe read_cqe;
	struct rdma_rw_ctx rw;

	struct nvmet_req req;

	u8 n_rdma;
	u32 flags;
	u32 invalidate_rkey;

	struct list_head wait_list;
	struct list_head free_list;
};

enum nvmet_rdma_queue_state {
	NVMET_RDMA_Q_CONNECTING,
	NVMET_RDMA_Q_LIVE,
	NVMET_RDMA_Q_DISCONNECTING,
};

struct nvmet_rdma_queue {
	struct rdma_cm_id	*cm_id;
	struct nvmet_port	*port;
	struct ib_cq		*cq;
	atomic_t		sq_wr_avail;
	struct nvmet_rdma_device *dev;
	spinlock_t		state_lock;
	enum nvmet_rdma_queue_state state;
	struct nvmet_cq		nvme_cq;
	struct nvmet_sq		nvme_sq;

	struct nvmet_rdma_rsp	*rsps;
	struct list_head	free_rsps;
	spinlock_t		rsps_lock;
	struct nvmet_rdma_cmd	*cmds;

	struct work_struct	release_work;
	struct list_head	rsp_wait_list;
	struct list_head	rsp_wr_wait_list;
	spinlock_t		rsp_wr_wait_lock;

	int			idx;
	int			host_qid;
	int			recv_queue_size;
	int			send_queue_size;

	struct list_head	queue_list;
};

struct nvmet_rdma_device {
	struct ib_device	*device;
	struct ib_pd		*pd;
	struct ib_srq		*srq;
	struct nvmet_rdma_cmd	*srq_cmds;
	size_t			srq_size;
	struct kref		ref;
	struct list_head	entry;
	int			inline_data_size;
	int			inline_page_count;
};

static bool nvmet_rdma_use_srq;
module_param_named(use_srq, nvmet_rdma_use_srq, bool, 0444);
MODULE_PARM_DESC(use_srq, "Use shared receive queue.");

static DEFINE_IDA(nvmet_rdma_queue_ida);
static LIST_HEAD(nvmet_rdma_queue_list);
static DEFINE_MUTEX(nvmet_rdma_queue_mutex);

static LIST_HEAD(device_list);
static DEFINE_MUTEX(device_list_mutex);

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp);
static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc);
static void nvmet_rdma_qp_event(struct ib_event *event, void *priv);
static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue);

static const struct nvmet_fabrics_ops nvmet_rdma_ops;

static int num_pages(int len)
{
	return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
}

/* XXX: really should move to a generic header sooner or later.. */
static inline u32 get_unaligned_le24(const u8 *p)
{
	return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
}

static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
	return nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline bool nvmet_rdma_need_data_out(struct nvmet_rdma_rsp *rsp)
{
	return !nvme_is_write(rsp->req.cmd) &&
		rsp->req.transfer_len &&
		!rsp->req.rsp->status &&
		!(rsp->flags & NVMET_RDMA_REQ_INLINE_DATA);
}

static inline struct nvmet_rdma_rsp *
nvmet_rdma_get_rsp(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_rsp *rsp;
	unsigned long flags;

	spin_lock_irqsave(&queue->rsps_lock, flags);
	rsp = list_first_entry(&queue->free_rsps,
			struct nvmet_rdma_rsp, free_list);
	list_del(&rsp->free_list);
	spin_unlock_irqrestore(&queue->rsps_lock, flags);

	return rsp;
}

static inline void
nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
{
	unsigned long flags;

	spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
	list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
	spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}

static void nvmet_rdma_free_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	int i;

	if (!ndev->inline_data_size)
		return;

	sg = c->inline_sg;
	sge = &c->sge[1];

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
}

static int nvmet_rdma_alloc_inline_pages(struct nvmet_rdma_device *ndev,
				struct nvmet_rdma_cmd *c)
{
	struct scatterlist *sg;
	struct ib_sge *sge;
	struct page *pg;
	int len;
	int i;

	if (!ndev->inline_data_size)
		return 0;

	sg = c->inline_sg;
	sg_init_table(sg, ndev->inline_page_count);
	sge = &c->sge[1];
	len = ndev->inline_data_size;

	for (i = 0; i < ndev->inline_page_count; i++, sg++, sge++) {
		pg = alloc_page(GFP_KERNEL);
		if (!pg)
			goto out_err;
		sg_assign_page(sg, pg);
		sge->addr = ib_dma_map_page(ndev->device,
			pg, 0, PAGE_SIZE, DMA_FROM_DEVICE);
		if (ib_dma_mapping_error(ndev->device, sge->addr))
			goto out_err;
		sge->length = min_t(int, len, PAGE_SIZE);
		sge->lkey = ndev->pd->local_dma_lkey;
		len -= sge->length;
	}

	return 0;
out_err:
	for (; i >= 0; i--, sg--, sge--) {
		if (sge->length)
			ib_dma_unmap_page(ndev->device, sge->addr,
					sge->length, DMA_FROM_DEVICE);
		if (sg_page(sg))
			__free_page(sg_page(sg));
	}
	return -ENOMEM;
}
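
/*
 * Each command slot owns one RDMA RECV work request: sge[0] covers the
 * 64-byte NVMe command capsule, and for I/O queues sge[1..] cover the
 * preallocated inline data pages set up above.
 */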

static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
			struct nvmet_rdma_cmd *c, bool admin)
{
	/* NVMe command / RDMA RECV */
	c->nvme_cmd = kmalloc(sizeof(*c->nvme_cmd), GFP_KERNEL);
	if (!c->nvme_cmd)
		goto out;

	c->sge[0].addr = ib_dma_map_single(ndev->device, c->nvme_cmd,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	if (ib_dma_mapping_error(ndev->device, c->sge[0].addr))
		goto out_free_cmd;

	c->sge[0].length = sizeof(*c->nvme_cmd);
	c->sge[0].lkey = ndev->pd->local_dma_lkey;

	if (!admin && nvmet_rdma_alloc_inline_pages(ndev, c))
		goto out_unmap_cmd;

	c->cqe.done = nvmet_rdma_recv_done;

	c->wr.wr_cqe = &c->cqe;
	c->wr.sg_list = c->sge;
	c->wr.num_sge = admin ? 1 : ndev->inline_page_count + 1;

	return 0;

out_unmap_cmd:
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
			sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
out_free_cmd:
	kfree(c->nvme_cmd);

out:
	return -ENOMEM;
}

static void nvmet_rdma_free_cmd(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *c, bool admin)
{
	if (!admin)
		nvmet_rdma_free_inline_pages(ndev, c);
	ib_dma_unmap_single(ndev->device, c->sge[0].addr,
				sizeof(*c->nvme_cmd), DMA_FROM_DEVICE);
	kfree(c->nvme_cmd);
}

static struct nvmet_rdma_cmd *
nvmet_rdma_alloc_cmds(struct nvmet_rdma_device *ndev,
		int nr_cmds, bool admin)
{
	struct nvmet_rdma_cmd *cmds;
	int ret = -EINVAL, i;

	cmds = kcalloc(nr_cmds, sizeof(struct nvmet_rdma_cmd), GFP_KERNEL);
	if (!cmds)
		goto out;

	for (i = 0; i < nr_cmds; i++) {
		ret = nvmet_rdma_alloc_cmd(ndev, cmds + i, admin);
		if (ret)
			goto out_free;
	}

	return cmds;

out_free:
	while (--i >= 0)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
out:
	return ERR_PTR(ret);
}

static void nvmet_rdma_free_cmds(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmds, int nr_cmds, bool admin)
{
	int i;

	for (i = 0; i < nr_cmds; i++)
		nvmet_rdma_free_cmd(ndev, cmds + i, admin);
	kfree(cmds);
}

static int nvmet_rdma_alloc_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	/* NVMe CQE / RDMA SEND */
	r->req.rsp = kmalloc(sizeof(*r->req.rsp), GFP_KERNEL);
	if (!r->req.rsp)
		goto out;

	r->send_sge.addr = ib_dma_map_single(ndev->device, r->req.rsp,
			sizeof(*r->req.rsp), DMA_TO_DEVICE);
	if (ib_dma_mapping_error(ndev->device, r->send_sge.addr))
		goto out_free_rsp;

	r->send_sge.length = sizeof(*r->req.rsp);
	r->send_sge.lkey = ndev->pd->local_dma_lkey;

	r->send_cqe.done = nvmet_rdma_send_done;

	r->send_wr.wr_cqe = &r->send_cqe;
	r->send_wr.sg_list = &r->send_sge;
	r->send_wr.num_sge = 1;
	r->send_wr.send_flags = IB_SEND_SIGNALED;

	/* Data In / RDMA READ */
	r->read_cqe.done = nvmet_rdma_read_data_done;
	return 0;

out_free_rsp:
	kfree(r->req.rsp);
out:
	return -ENOMEM;
}

static void nvmet_rdma_free_rsp(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_rsp *r)
{
	ib_dma_unmap_single(ndev->device, r->send_sge.addr,
				sizeof(*r->req.rsp), DMA_TO_DEVICE);
	kfree(r->req.rsp);
}
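
/*
 * Note: twice recv_queue_size response contexts are allocated below,
 * presumably so a response can still be waiting for its SEND completion
 * while the receive buffer it answered has already been reposted and is
 * handling a new command.
 */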

static int
nvmet_rdma_alloc_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int nr_rsps = queue->recv_queue_size * 2;
	int ret = -EINVAL, i;

	queue->rsps = kcalloc(nr_rsps, sizeof(struct nvmet_rdma_rsp),
			GFP_KERNEL);
	if (!queue->rsps)
		goto out;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		ret = nvmet_rdma_alloc_rsp(ndev, rsp);
		if (ret)
			goto out_free;

		list_add_tail(&rsp->free_list, &queue->free_rsps);
	}

	return 0;

out_free:
	while (--i >= 0) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
out:
	return ret;
}

static void nvmet_rdma_free_rsps(struct nvmet_rdma_queue *queue)
{
	struct nvmet_rdma_device *ndev = queue->dev;
	int i, nr_rsps = queue->recv_queue_size * 2;

	for (i = 0; i < nr_rsps; i++) {
		struct nvmet_rdma_rsp *rsp = &queue->rsps[i];

		list_del(&rsp->free_list);
		nvmet_rdma_free_rsp(ndev, rsp);
	}
	kfree(queue->rsps);
}

static int nvmet_rdma_post_recv(struct nvmet_rdma_device *ndev,
		struct nvmet_rdma_cmd *cmd)
{
	int ret;

	ib_dma_sync_single_for_device(ndev->device,
		cmd->sge[0].addr, cmd->sge[0].length,
		DMA_FROM_DEVICE);

	if (ndev->srq)
		ret = ib_post_srq_recv(ndev->srq, &cmd->wr, NULL);
	else
		ret = ib_post_recv(cmd->queue->cm_id->qp, &cmd->wr, NULL);

	if (unlikely(ret))
		pr_err("post_recv cmd failed\n");

	return ret;
}

static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
{
	spin_lock(&queue->rsp_wr_wait_lock);
	while (!list_empty(&queue->rsp_wr_wait_list)) {
		struct nvmet_rdma_rsp *rsp;
		bool ret;

		rsp = list_entry(queue->rsp_wr_wait_list.next,
				struct nvmet_rdma_rsp, wait_list);
		list_del(&rsp->wait_list);

		spin_unlock(&queue->rsp_wr_wait_lock);
		ret = nvmet_rdma_execute_command(rsp);
		spin_lock(&queue->rsp_wr_wait_lock);

		if (!ret) {
			list_add(&rsp->wait_list, &queue->rsp_wr_wait_list);
			break;
		}
	}
	spin_unlock(&queue->rsp_wr_wait_lock);
}


static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);

	if (rsp->n_rdma) {
		rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, rsp->req.sg,
				rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	}

	if (rsp->req.sg != rsp->cmd->inline_sg)
		sgl_free(rsp->req.sg);

	if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
		nvmet_rdma_process_wr_wait_list(queue);

	nvmet_rdma_put_rsp(rsp);
}

static void nvmet_rdma_error_comp(struct nvmet_rdma_queue *queue)
{
	if (queue->nvme_sq.ctrl) {
		nvmet_ctrl_fatal_error(queue->nvme_sq.ctrl);
	} else {
		/*
		 * We didn't set up the controller yet in case of an admin
		 * connect error, so just disconnect and clean up the queue.
		 */
		nvmet_rdma_queue_disconnect(queue);
	}
}

static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, send_cqe);

	nvmet_rdma_release_rsp(rsp);

	if (unlikely(wc->status != IB_WC_SUCCESS &&
		     wc->status != IB_WC_WR_FLUSH_ERR)) {
		pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
			wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
		nvmet_rdma_error_comp(rsp->queue);
	}
}
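
/*
 * Response path: for commands with data to return, the RDMA WRITE work
 * requests built by the rw API are chained in front of the SEND carrying
 * the NVMe completion, so a single ib_post_send() posts the whole chain.
 */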

static void nvmet_rdma_queue_response(struct nvmet_req *req)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(req, struct nvmet_rdma_rsp, req);
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	struct ib_send_wr *first_wr;

	if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
		rsp->send_wr.opcode = IB_WR_SEND_WITH_INV;
		rsp->send_wr.ex.invalidate_rkey = rsp->invalidate_rkey;
	} else {
		rsp->send_wr.opcode = IB_WR_SEND;
	}

	if (nvmet_rdma_need_data_out(rsp))
		first_wr = rdma_rw_ctx_wrs(&rsp->rw, cm_id->qp,
				cm_id->port_num, NULL, &rsp->send_wr);
	else
		first_wr = &rsp->send_wr;

	nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);

	ib_dma_sync_single_for_device(rsp->queue->dev->device,
		rsp->send_sge.addr, rsp->send_sge.length,
		DMA_TO_DEVICE);

	if (unlikely(ib_post_send(cm_id->qp, first_wr, NULL))) {
		pr_err("sending cmd response failed\n");
		nvmet_rdma_release_rsp(rsp);
	}
}

static void nvmet_rdma_read_data_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_rsp *rsp =
		container_of(wc->wr_cqe, struct nvmet_rdma_rsp, read_cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;

	WARN_ON(rsp->n_rdma <= 0);
	atomic_add(rsp->n_rdma, &queue->sq_wr_avail);
	rdma_rw_ctx_destroy(&rsp->rw, queue->cm_id->qp,
			queue->cm_id->port_num, rsp->req.sg,
			rsp->req.sg_cnt, nvmet_data_dir(&rsp->req));
	rsp->n_rdma = 0;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		nvmet_req_uninit(&rsp->req);
		nvmet_rdma_release_rsp(rsp);
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_info("RDMA READ for CQE 0x%p failed with status %s (%d).\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	nvmet_req_execute(&rsp->req);
}

static void nvmet_rdma_use_inline_sg(struct nvmet_rdma_rsp *rsp, u32 len,
		u64 off)
{
	int sg_count = num_pages(len);
	struct scatterlist *sg;
	int i;

	sg = rsp->cmd->inline_sg;
	for (i = 0; i < sg_count; i++, sg++) {
		if (i < sg_count - 1)
			sg_unmark_end(sg);
		else
			sg_mark_end(sg);
		sg->offset = off;
		sg->length = min_t(int, len, PAGE_SIZE - off);
		len -= sg->length;
		if (!i)
			off = 0;
	}

	rsp->req.sg = rsp->cmd->inline_sg;
	rsp->req.sg_cnt = sg_count;
}

static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_sgl_desc *sgl = &rsp->req.cmd->common.dptr.sgl;
	u64 off = le64_to_cpu(sgl->addr);
	u32 len = le32_to_cpu(sgl->length);

	if (!nvme_is_write(rsp->req.cmd))
		return NVME_SC_INVALID_FIELD | NVME_SC_DNR;

	if (off + len > rsp->queue->dev->inline_data_size) {
		pr_err("invalid inline data offset!\n");
		return NVME_SC_SGL_INVALID_OFFSET | NVME_SC_DNR;
	}

	/* no data command? */
	if (!len)
		return 0;

	nvmet_rdma_use_inline_sg(rsp, len, off);
	rsp->flags |= NVMET_RDMA_REQ_INLINE_DATA;
	rsp->req.transfer_len += len;
	return 0;
}
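
/*
 * Keyed SGLs describe a buffer in host memory: allocate a local scatterlist
 * and an rdma_rw context targeting the host's (addr, key). The number of
 * work requests the rw API needs is accounted in n_rdma so that send queue
 * space can be reserved before the command is executed.
 */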

static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
		struct nvme_keyed_sgl_desc *sgl, bool invalidate)
{
	struct rdma_cm_id *cm_id = rsp->queue->cm_id;
	u64 addr = le64_to_cpu(sgl->addr);
	u32 len = get_unaligned_le24(sgl->length);
	u32 key = get_unaligned_le32(sgl->key);
	int ret;

	/* no data command? */
	if (!len)
		return 0;

	rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
	if (!rsp->req.sg)
		return NVME_SC_INTERNAL;

	ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
			rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
			nvmet_data_dir(&rsp->req));
	if (ret < 0)
		return NVME_SC_INTERNAL;
	rsp->req.transfer_len += len;
	rsp->n_rdma += ret;

	if (invalidate) {
		rsp->invalidate_rkey = key;
		rsp->flags |= NVMET_RDMA_REQ_INVALIDATE_RKEY;
	}

	return 0;
}

static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
{
	struct nvme_keyed_sgl_desc *sgl = &rsp->req.cmd->common.dptr.ksgl;

	switch (sgl->type >> 4) {
	case NVME_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_OFFSET:
			return nvmet_rdma_map_sgl_inline(rsp);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	case NVME_KEY_SGL_FMT_DATA_DESC:
		switch (sgl->type & 0xf) {
		case NVME_SGL_FMT_ADDRESS | NVME_SGL_FMT_INVALIDATE:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, true);
		case NVME_SGL_FMT_ADDRESS:
			return nvmet_rdma_map_sgl_keyed(rsp, sgl, false);
		default:
			pr_err("invalid SGL subtype: %#x\n", sgl->type);
			return NVME_SC_INVALID_FIELD | NVME_SC_DNR;
		}
	default:
		pr_err("invalid SGL type: %#x\n", sgl->type);
		return NVME_SC_SGL_INVALID_TYPE | NVME_SC_DNR;
	}
}

static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
{
	struct nvmet_rdma_queue *queue = rsp->queue;

	if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
			&queue->sq_wr_avail) < 0)) {
		pr_debug("IB send queue full (needed %d): queue %u cntlid %u\n",
				1 + rsp->n_rdma, queue->idx,
				queue->nvme_sq.ctrl->cntlid);
		atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
		return false;
	}

	if (nvmet_rdma_need_data_in(rsp)) {
		if (rdma_rw_ctx_post(&rsp->rw, queue->cm_id->qp,
				queue->cm_id->port_num, &rsp->read_cqe, NULL))
			nvmet_req_complete(&rsp->req, NVME_SC_DATA_XFER_ERROR);
	} else {
		nvmet_req_execute(&rsp->req);
	}

	return true;
}

static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
		struct nvmet_rdma_rsp *cmd)
{
	u16 status;

	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
		DMA_FROM_DEVICE);
	ib_dma_sync_single_for_cpu(queue->dev->device,
		cmd->send_sge.addr, cmd->send_sge.length,
		DMA_TO_DEVICE);

	if (!nvmet_req_init(&cmd->req, &queue->nvme_cq,
			&queue->nvme_sq, &nvmet_rdma_ops))
		return;

	status = nvmet_rdma_map_sgl(cmd);
	if (status)
		goto out_err;

	if (unlikely(!nvmet_rdma_execute_command(cmd))) {
		spin_lock(&queue->rsp_wr_wait_lock);
		list_add_tail(&cmd->wait_list, &queue->rsp_wr_wait_list);
		spin_unlock(&queue->rsp_wr_wait_lock);
	}

	return;

out_err:
	nvmet_req_complete(&cmd->req, status);
}
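
/*
 * RECV completion: commands that arrive while the queue is still
 * CONNECTING are parked on rsp_wait_list and replayed once the CM
 * ESTABLISHED event moves the queue to LIVE.
 */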

static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
	struct nvmet_rdma_cmd *cmd =
		container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
	struct nvmet_rdma_queue *queue = cq->cq_context;
	struct nvmet_rdma_rsp *rsp;

	if (unlikely(wc->status != IB_WC_SUCCESS)) {
		if (wc->status != IB_WC_WR_FLUSH_ERR) {
			pr_err("RECV for CQE 0x%p failed with status %s (%d)\n",
				wc->wr_cqe, ib_wc_status_msg(wc->status),
				wc->status);
			nvmet_rdma_error_comp(queue);
		}
		return;
	}

	if (unlikely(wc->byte_len < sizeof(struct nvme_command))) {
		pr_err("Ctrl Fatal Error: capsule size less than 64 bytes\n");
		nvmet_rdma_error_comp(queue);
		return;
	}

	cmd->queue = queue;
	rsp = nvmet_rdma_get_rsp(queue);
	rsp->queue = queue;
	rsp->cmd = cmd;
	rsp->flags = 0;
	rsp->req.cmd = cmd->nvme_cmd;
	rsp->req.port = queue->port;
	rsp->n_rdma = 0;

	if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
		unsigned long flags;

		spin_lock_irqsave(&queue->state_lock, flags);
		if (queue->state == NVMET_RDMA_Q_CONNECTING)
			list_add_tail(&rsp->wait_list, &queue->rsp_wait_list);
		else
			nvmet_rdma_put_rsp(rsp);
		spin_unlock_irqrestore(&queue->state_lock, flags);
		return;
	}

	nvmet_rdma_handle_command(queue, rsp);
}

static void nvmet_rdma_destroy_srq(struct nvmet_rdma_device *ndev)
{
	if (!ndev->srq)
		return;

	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
	ib_destroy_srq(ndev->srq);
}

static int nvmet_rdma_init_srq(struct nvmet_rdma_device *ndev)
{
	struct ib_srq_init_attr srq_attr = { NULL, };
	struct ib_srq *srq;
	size_t srq_size;
	int ret, i;

	srq_size = 4095;	/* XXX: tune */

	srq_attr.attr.max_wr = srq_size;
	srq_attr.attr.max_sge = 1 + ndev->inline_page_count;
	srq_attr.attr.srq_limit = 0;
	srq_attr.srq_type = IB_SRQT_BASIC;
	srq = ib_create_srq(ndev->pd, &srq_attr);
	if (IS_ERR(srq)) {
		/*
		 * If SRQs aren't supported we just go ahead and use normal
		 * non-shared receive queues.
		 */
		pr_info("SRQ requested but not supported.\n");
		return 0;
	}

	ndev->srq_cmds = nvmet_rdma_alloc_cmds(ndev, srq_size, false);
	if (IS_ERR(ndev->srq_cmds)) {
		ret = PTR_ERR(ndev->srq_cmds);
		goto out_destroy_srq;
	}

	ndev->srq = srq;
	ndev->srq_size = srq_size;

	for (i = 0; i < srq_size; i++) {
		ret = nvmet_rdma_post_recv(ndev, &ndev->srq_cmds[i]);
		if (ret)
			goto out_free_cmds;
	}

	return 0;

out_free_cmds:
	nvmet_rdma_free_cmds(ndev, ndev->srq_cmds, ndev->srq_size, false);
out_destroy_srq:
	ib_destroy_srq(srq);
	return ret;
}

static void nvmet_rdma_free_dev(struct kref *ref)
{
	struct nvmet_rdma_device *ndev =
		container_of(ref, struct nvmet_rdma_device, ref);

	mutex_lock(&device_list_mutex);
	list_del(&ndev->entry);
	mutex_unlock(&device_list_mutex);

	nvmet_rdma_destroy_srq(ndev);
	ib_dealloc_pd(ndev->pd);

	kfree(ndev);
}
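
/*
 * Device contexts are shared across queues: look up an existing entry by
 * node GUID under device_list_mutex, otherwise allocate a new one, clamp
 * the port's inline data size to what the device's receive SGE limits
 * allow, and set up the PD (and optionally the SRQ).
 */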

static struct nvmet_rdma_device *
nvmet_rdma_find_get_device(struct rdma_cm_id *cm_id)
{
	struct nvmet_port *port = cm_id->context;
	struct nvmet_rdma_device *ndev;
	int inline_page_count;
	int inline_sge_count;
	int ret;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device->node_guid == cm_id->device->node_guid &&
		    kref_get_unless_zero(&ndev->ref))
			goto out_unlock;
	}

	ndev = kzalloc(sizeof(*ndev), GFP_KERNEL);
	if (!ndev)
		goto out_err;

	inline_page_count = num_pages(port->inline_data_size);
	inline_sge_count = max(cm_id->device->attrs.max_sge_rd,
				cm_id->device->attrs.max_recv_sge) - 1;
	if (inline_page_count > inline_sge_count) {
		pr_warn("inline_data_size %d cannot be supported by device %s. Reducing to %lu.\n",
			port->inline_data_size, cm_id->device->name,
			inline_sge_count * PAGE_SIZE);
		port->inline_data_size = inline_sge_count * PAGE_SIZE;
		inline_page_count = inline_sge_count;
	}
	ndev->inline_data_size = port->inline_data_size;
	ndev->inline_page_count = inline_page_count;
	ndev->device = cm_id->device;
	kref_init(&ndev->ref);

	ndev->pd = ib_alloc_pd(ndev->device, 0);
	if (IS_ERR(ndev->pd))
		goto out_free_dev;

	if (nvmet_rdma_use_srq) {
		ret = nvmet_rdma_init_srq(ndev);
		if (ret)
			goto out_free_pd;
	}

	list_add(&ndev->entry, &device_list);
out_unlock:
	mutex_unlock(&device_list_mutex);
	pr_debug("added %s.\n", ndev->device->name);
	return ndev;

out_free_pd:
	ib_dealloc_pd(ndev->pd);
out_free_dev:
	kfree(ndev);
out_err:
	mutex_unlock(&device_list_mutex);
	return NULL;
}

static int nvmet_rdma_create_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp_init_attr qp_attr;
	struct nvmet_rdma_device *ndev = queue->dev;
	int comp_vector, nr_cqe, ret, i;

	/*
	 * Spread the io queues across completion vectors,
	 * but still keep all admin queues on vector 0.
	 */
	comp_vector = !queue->host_qid ? 0 :
		queue->idx % ndev->device->num_comp_vectors;

	/*
	 * Reserve CQ slots for RECV + RDMA_READ/RDMA_WRITE + RDMA_SEND.
	 */
	nr_cqe = queue->recv_queue_size + 2 * queue->send_queue_size;

	queue->cq = ib_alloc_cq(ndev->device, queue,
			nr_cqe + 1, comp_vector,
			IB_POLL_WORKQUEUE);
	if (IS_ERR(queue->cq)) {
		ret = PTR_ERR(queue->cq);
		pr_err("failed to create CQ cqe= %d ret= %d\n",
		       nr_cqe + 1, ret);
		goto out;
	}

	memset(&qp_attr, 0, sizeof(qp_attr));
	qp_attr.qp_context = queue;
	qp_attr.event_handler = nvmet_rdma_qp_event;
	qp_attr.send_cq = queue->cq;
	qp_attr.recv_cq = queue->cq;
	qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	qp_attr.qp_type = IB_QPT_RC;
	/* +1 for drain */
	qp_attr.cap.max_send_wr = queue->send_queue_size + 1;
	qp_attr.cap.max_rdma_ctxs = queue->send_queue_size;
	qp_attr.cap.max_send_sge = max(ndev->device->attrs.max_sge_rd,
					ndev->device->attrs.max_send_sge);

	if (ndev->srq) {
		qp_attr.srq = ndev->srq;
	} else {
		/* +1 for drain */
		qp_attr.cap.max_recv_wr = 1 + queue->recv_queue_size;
		qp_attr.cap.max_recv_sge = 1 + ndev->inline_page_count;
	}

	ret = rdma_create_qp(queue->cm_id, ndev->pd, &qp_attr);
	if (ret) {
		pr_err("failed to create_qp ret= %d\n", ret);
		goto err_destroy_cq;
	}

	atomic_set(&queue->sq_wr_avail, qp_attr.cap.max_send_wr);

	pr_debug("%s: max_cqe= %d max_sge= %d sq_size = %d cm_id= %p\n",
		 __func__, queue->cq->cqe, qp_attr.cap.max_send_sge,
		 qp_attr.cap.max_send_wr, queue->cm_id);

	if (!ndev->srq) {
		for (i = 0; i < queue->recv_queue_size; i++) {
			queue->cmds[i].queue = queue;
			ret = nvmet_rdma_post_recv(ndev, &queue->cmds[i]);
			if (ret)
				goto err_destroy_qp;
		}
	}

out:
	return ret;

err_destroy_qp:
	rdma_destroy_qp(queue->cm_id);
err_destroy_cq:
	ib_free_cq(queue->cq);
	goto out;
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
	struct ib_qp *qp = queue->cm_id->qp;

	ib_drain_qp(qp);
	rdma_destroy_id(queue->cm_id);
	ib_destroy_qp(qp);
	ib_free_cq(queue->cq);
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
	pr_debug("freeing queue %d\n", queue->idx);

	nvmet_sq_destroy(&queue->nvme_sq);

	nvmet_rdma_destroy_queue_ib(queue);
	if (!queue->dev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
	nvmet_rdma_free_rsps(queue);
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
	kfree(queue);
}

static void nvmet_rdma_release_queue_work(struct work_struct *w)
{
	struct nvmet_rdma_queue *queue =
		container_of(w, struct nvmet_rdma_queue, release_work);
	struct nvmet_rdma_device *dev = queue->dev;

	nvmet_rdma_free_queue(queue);

	kref_put(&dev->ref, nvmet_rdma_free_dev);
}

static int
nvmet_rdma_parse_cm_connect_req(struct rdma_conn_param *conn,
				struct nvmet_rdma_queue *queue)
{
	struct nvme_rdma_cm_req *req;

	req = (struct nvme_rdma_cm_req *)conn->private_data;
	if (!req || conn->private_data_len == 0)
		return NVME_RDMA_CM_INVALID_LEN;

	if (le16_to_cpu(req->recfmt) != NVME_RDMA_CM_FMT_1_0)
		return NVME_RDMA_CM_INVALID_RECFMT;

	queue->host_qid = le16_to_cpu(req->qid);

	/*
	 * req->hsqsize corresponds to our recv queue size plus 1
	 * req->hrqsize corresponds to our send queue size
	 */
	queue->recv_queue_size = le16_to_cpu(req->hsqsize) + 1;
	queue->send_queue_size = le16_to_cpu(req->hrqsize);

	if (!queue->host_qid && queue->recv_queue_size > NVME_AQ_DEPTH)
		return NVME_RDMA_CM_INVALID_HSQSIZE;

	/* XXX: Should we enforce some kind of max for IO queues? */

	return 0;
}

static int nvmet_rdma_cm_reject(struct rdma_cm_id *cm_id,
		enum nvme_rdma_cm_status status)
{
	struct nvme_rdma_cm_rej rej;

	pr_debug("rejecting connect request: status %d (%s)\n",
		 status, nvme_rdma_cm_msg(status));

	rej.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	rej.sts = cpu_to_le16(status);

	return rdma_reject(cm_id, (void *)&rej, sizeof(rej));
}

static struct nvmet_rdma_queue *
nvmet_rdma_alloc_queue(struct nvmet_rdma_device *ndev,
		struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue;
	int ret;

	queue = kzalloc(sizeof(*queue), GFP_KERNEL);
	if (!queue) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_reject;
	}

	ret = nvmet_sq_init(&queue->nvme_sq);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_queue;
	}

	ret = nvmet_rdma_parse_cm_connect_req(&event->param.conn, queue);
	if (ret)
		goto out_destroy_sq;

	/*
	 * Schedules the actual release because calling rdma_destroy_id from
	 * inside a CM callback would trigger a deadlock. (great API design..)
	 */
	INIT_WORK(&queue->release_work, nvmet_rdma_release_queue_work);
	queue->dev = ndev;
	queue->cm_id = cm_id;

	spin_lock_init(&queue->state_lock);
	queue->state = NVMET_RDMA_Q_CONNECTING;
	INIT_LIST_HEAD(&queue->rsp_wait_list);
	INIT_LIST_HEAD(&queue->rsp_wr_wait_list);
	spin_lock_init(&queue->rsp_wr_wait_lock);
	INIT_LIST_HEAD(&queue->free_rsps);
	spin_lock_init(&queue->rsps_lock);
	INIT_LIST_HEAD(&queue->queue_list);

	queue->idx = ida_simple_get(&nvmet_rdma_queue_ida, 0, 0, GFP_KERNEL);
	if (queue->idx < 0) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_destroy_sq;
	}

	ret = nvmet_rdma_alloc_rsps(queue);
	if (ret) {
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_ida_remove;
	}

	if (!ndev->srq) {
		queue->cmds = nvmet_rdma_alloc_cmds(ndev,
				queue->recv_queue_size,
				!queue->host_qid);
		if (IS_ERR(queue->cmds)) {
			ret = NVME_RDMA_CM_NO_RSC;
			goto out_free_responses;
		}
	}

	ret = nvmet_rdma_create_queue_ib(queue);
	if (ret) {
		pr_err("%s: creating RDMA queue failed (%d).\n",
			__func__, ret);
		ret = NVME_RDMA_CM_NO_RSC;
		goto out_free_cmds;
	}

	return queue;

out_free_cmds:
	if (!ndev->srq) {
		nvmet_rdma_free_cmds(queue->dev, queue->cmds,
				queue->recv_queue_size,
				!queue->host_qid);
	}
out_free_responses:
	nvmet_rdma_free_rsps(queue);
out_ida_remove:
	ida_simple_remove(&nvmet_rdma_queue_ida, queue->idx);
out_destroy_sq:
	nvmet_sq_destroy(&queue->nvme_sq);
out_free_queue:
	kfree(queue);
out_reject:
	nvmet_rdma_cm_reject(cm_id, ret);
	return NULL;
}

static void nvmet_rdma_qp_event(struct ib_event *event, void *priv)
{
	struct nvmet_rdma_queue *queue = priv;

	switch (event->event) {
	case IB_EVENT_COMM_EST:
		rdma_notify(queue->cm_id, event->event);
		break;
	default:
		pr_err("received IB QP event: %s (%d)\n",
		       ib_event_msg(event->event), event->event);
		break;
	}
}

static int nvmet_rdma_cm_accept(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue,
		struct rdma_conn_param *p)
{
	struct rdma_conn_param param = { };
	struct nvme_rdma_cm_rep priv = { };
	int ret = -ENOMEM;

	param.rnr_retry_count = 7;
	param.flow_control = 1;
	param.initiator_depth = min_t(u8, p->initiator_depth,
		queue->dev->device->attrs.max_qp_init_rd_atom);
	param.private_data = &priv;
	param.private_data_len = sizeof(priv);
	priv.recfmt = cpu_to_le16(NVME_RDMA_CM_FMT_1_0);
	priv.crqsize = cpu_to_le16(queue->recv_queue_size);

	ret = rdma_accept(cm_id, &param);
	if (ret)
		pr_err("rdma_accept failed (error code = %d)\n", ret);

	return ret;
}
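
/*
 * CONNECT_REQUEST handling: take a reference on the device, allocate the
 * queue, and accept the connection. On accept failure the queue release
 * work is scheduled instead of destroying the cm_id here, since the CM
 * callback must not destroy its own id.
 */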

static int nvmet_rdma_queue_connect(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_device *ndev;
	struct nvmet_rdma_queue *queue;
	int ret = -EINVAL;

	ndev = nvmet_rdma_find_get_device(cm_id);
	if (!ndev) {
		nvmet_rdma_cm_reject(cm_id, NVME_RDMA_CM_NO_RSC);
		return -ECONNREFUSED;
	}

	queue = nvmet_rdma_alloc_queue(ndev, cm_id, event);
	if (!queue) {
		ret = -ENOMEM;
		goto put_device;
	}
	queue->port = cm_id->context;

	if (queue->host_qid == 0) {
		/* Let inflight controller teardown complete */
		flush_scheduled_work();
	}

	ret = nvmet_rdma_cm_accept(cm_id, queue, &event->param.conn);
	if (ret) {
		schedule_work(&queue->release_work);
		/* Destroying rdma_cm id is not needed here */
		return 0;
	}

	mutex_lock(&nvmet_rdma_queue_mutex);
	list_add_tail(&queue->queue_list, &nvmet_rdma_queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	return 0;

put_device:
	kref_put(&ndev->ref, nvmet_rdma_free_dev);

	return ret;
}

static void nvmet_rdma_queue_established(struct nvmet_rdma_queue *queue)
{
	unsigned long flags;

	spin_lock_irqsave(&queue->state_lock, flags);
	if (queue->state != NVMET_RDMA_Q_CONNECTING) {
		pr_warn("trying to establish a connected queue\n");
		goto out_unlock;
	}
	queue->state = NVMET_RDMA_Q_LIVE;

	while (!list_empty(&queue->rsp_wait_list)) {
		struct nvmet_rdma_rsp *cmd;

		cmd = list_first_entry(&queue->rsp_wait_list,
					struct nvmet_rdma_rsp, wait_list);
		list_del(&cmd->wait_list);

		spin_unlock_irqrestore(&queue->state_lock, flags);
		nvmet_rdma_handle_command(queue, cmd);
		spin_lock_irqsave(&queue->state_lock, flags);
	}

out_unlock:
	spin_unlock_irqrestore(&queue->state_lock, flags);
}

static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;
	unsigned long flags;

	pr_debug("cm_id= %p queue->state= %d\n", queue->cm_id, queue->state);

	spin_lock_irqsave(&queue->state_lock, flags);
	switch (queue->state) {
	case NVMET_RDMA_Q_CONNECTING:
	case NVMET_RDMA_Q_LIVE:
		queue->state = NVMET_RDMA_Q_DISCONNECTING;
		disconnect = true;
		break;
	case NVMET_RDMA_Q_DISCONNECTING:
		break;
	}
	spin_unlock_irqrestore(&queue->state_lock, flags);

	if (disconnect) {
		rdma_disconnect(queue->cm_id);
		schedule_work(&queue->release_work);
	}
}

static void nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
	bool disconnect = false;

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list)) {
		list_del_init(&queue->queue_list);
		disconnect = true;
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	if (disconnect)
		__nvmet_rdma_queue_disconnect(queue);
}

static void nvmet_rdma_queue_connect_fail(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	WARN_ON_ONCE(queue->state != NVMET_RDMA_Q_CONNECTING);

	mutex_lock(&nvmet_rdma_queue_mutex);
	if (!list_empty(&queue->queue_list))
		list_del_init(&queue->queue_list);
	mutex_unlock(&nvmet_rdma_queue_mutex);

	pr_err("failed to connect queue %d\n", queue->idx);
	schedule_work(&queue->release_work);
}

/**
 * nvmet_rdma_device_removal() - Handle RDMA device removal
 * @cm_id:	rdma_cm id, used for nvmet port
 * @queue:	nvmet rdma queue (cm id qp_context)
 *
 * DEVICE_REMOVAL event notifies us that the RDMA device is about
 * to unplug. Note that this event can be generated on a normal
 * queue cm_id and/or a device bound listener cm_id (in which case
 * queue will be NULL).
 *
 * We registered an ib_client to handle device removal for queues,
 * so we only need to handle the listening port cm_ids. In this case
 * we nullify the priv to prevent double cm_id destruction, and destroy
 * the cm_id implicitly by returning a non-zero rc to the callout.
 */
static int nvmet_rdma_device_removal(struct rdma_cm_id *cm_id,
		struct nvmet_rdma_queue *queue)
{
	struct nvmet_port *port;

	if (queue) {
		/*
		 * This is a queue cm_id. We have registered
		 * an ib_client to handle queue removal,
		 * so don't interfere and just return.
		 */
		return 0;
	}

	port = cm_id->context;

	/*
	 * This is a listener cm_id. Make sure that
	 * future remove_port won't invoke a double
	 * cm_id destroy. Use an atomic xchg to make sure
	 * we don't compete with remove_port.
	 */
	if (xchg(&port->priv, NULL) != cm_id)
		return 0;

	/*
	 * We need to return 1 so that the core will destroy
	 * its own ID. What a great API design..
	 */
	return 1;
}

static int nvmet_rdma_cm_handler(struct rdma_cm_id *cm_id,
		struct rdma_cm_event *event)
{
	struct nvmet_rdma_queue *queue = NULL;
	int ret = 0;

	if (cm_id->qp)
		queue = cm_id->qp->qp_context;

	pr_debug("%s (%d): status %d id %p\n",
		rdma_event_msg(event->event), event->event,
		event->status, cm_id);

	switch (event->event) {
	case RDMA_CM_EVENT_CONNECT_REQUEST:
		ret = nvmet_rdma_queue_connect(cm_id, event);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		nvmet_rdma_queue_established(queue);
		break;
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		nvmet_rdma_queue_disconnect(queue);
		break;
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
		ret = nvmet_rdma_device_removal(cm_id, queue);
		break;
	case RDMA_CM_EVENT_REJECTED:
		pr_debug("Connection rejected: %s\n",
			 rdma_reject_msg(cm_id, event->status));
		/* FALLTHROUGH */
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_CONNECT_ERROR:
		nvmet_rdma_queue_connect_fail(cm_id, queue);
		break;
	default:
		pr_err("received unrecognized RDMA CM event %d\n",
			event->event);
		break;
	}

	return ret;
}

static void nvmet_rdma_delete_ctrl(struct nvmet_ctrl *ctrl)
{
	struct nvmet_rdma_queue *queue;

restart:
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry(queue, &nvmet_rdma_queue_list, queue_list) {
		if (queue->nvme_sq.ctrl == ctrl) {
			list_del_init(&queue->queue_list);
			mutex_unlock(&nvmet_rdma_queue_mutex);

			__nvmet_rdma_queue_disconnect(queue);
			goto restart;
		}
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);
}
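
/*
 * Port setup: resolve the configured traddr/trsvcid, create a listening
 * RDMA CM id bound to that address, and stash it in port->priv so that
 * remove_port (or DEVICE_REMOVAL on the listener) can tear it down.
 */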

static int nvmet_rdma_add_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id;
	struct sockaddr_storage addr = { };
	__kernel_sa_family_t af;
	int ret;

	switch (port->disc_addr.adrfam) {
	case NVMF_ADDR_FAMILY_IP4:
		af = AF_INET;
		break;
	case NVMF_ADDR_FAMILY_IP6:
		af = AF_INET6;
		break;
	default:
		pr_err("address family %d not supported\n",
				port->disc_addr.adrfam);
		return -EINVAL;
	}

	if (port->inline_data_size < 0) {
		port->inline_data_size = NVMET_RDMA_DEFAULT_INLINE_DATA_SIZE;
	} else if (port->inline_data_size > NVMET_RDMA_MAX_INLINE_DATA_SIZE) {
		pr_warn("inline_data_size %u is too large, reducing to %u\n",
			port->inline_data_size,
			NVMET_RDMA_MAX_INLINE_DATA_SIZE);
		port->inline_data_size = NVMET_RDMA_MAX_INLINE_DATA_SIZE;
	}

	ret = inet_pton_with_scope(&init_net, af, port->disc_addr.traddr,
			port->disc_addr.trsvcid, &addr);
	if (ret) {
		pr_err("malformed ip/port passed: %s:%s\n",
			port->disc_addr.traddr, port->disc_addr.trsvcid);
		return ret;
	}

	cm_id = rdma_create_id(&init_net, nvmet_rdma_cm_handler, port,
			RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id)) {
		pr_err("CM ID creation failed\n");
		return PTR_ERR(cm_id);
	}

	/*
	 * Allow both IPv4 and IPv6 sockets to bind a single port
	 * at the same time.
	 */
	ret = rdma_set_afonly(cm_id, 1);
	if (ret) {
		pr_err("rdma_set_afonly failed (%d)\n", ret);
		goto out_destroy_id;
	}

	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&addr);
	if (ret) {
		pr_err("binding CM ID to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	ret = rdma_listen(cm_id, 128);
	if (ret) {
		pr_err("listening to %pISpcs failed (%d)\n",
			(struct sockaddr *)&addr, ret);
		goto out_destroy_id;
	}

	pr_info("enabling port %d (%pISpcs)\n",
		le16_to_cpu(port->disc_addr.portid), (struct sockaddr *)&addr);
	port->priv = cm_id;
	return 0;

out_destroy_id:
	rdma_destroy_id(cm_id);
	return ret;
}

static void nvmet_rdma_remove_port(struct nvmet_port *port)
{
	struct rdma_cm_id *cm_id = xchg(&port->priv, NULL);

	if (cm_id)
		rdma_destroy_id(cm_id);
}

static void nvmet_rdma_disc_port_addr(struct nvmet_req *req,
		struct nvmet_port *port, char *traddr)
{
	struct rdma_cm_id *cm_id = port->priv;

	if (inet_addr_is_any((struct sockaddr *)&cm_id->route.addr.src_addr)) {
		struct nvmet_rdma_rsp *rsp =
			container_of(req, struct nvmet_rdma_rsp, req);
		struct rdma_cm_id *req_cm_id = rsp->queue->cm_id;
		struct sockaddr *addr = (void *)&req_cm_id->route.addr.src_addr;

		sprintf(traddr, "%pISc", addr);
	} else {
		memcpy(traddr, port->disc_addr.traddr, NVMF_TRADDR_SIZE);
	}
}

static const struct nvmet_fabrics_ops nvmet_rdma_ops = {
	.owner			= THIS_MODULE,
	.type			= NVMF_TRTYPE_RDMA,
	.msdbd			= 1,
	.has_keyed_sgls		= 1,
	.add_port		= nvmet_rdma_add_port,
	.remove_port		= nvmet_rdma_remove_port,
	.queue_response		= nvmet_rdma_queue_response,
	.delete_ctrl		= nvmet_rdma_delete_ctrl,
	.disc_traddr		= nvmet_rdma_disc_port_addr,
};

static void nvmet_rdma_remove_one(struct ib_device *ib_device, void *client_data)
{
	struct nvmet_rdma_queue *queue, *tmp;
	struct nvmet_rdma_device *ndev;
	bool found = false;

	mutex_lock(&device_list_mutex);
	list_for_each_entry(ndev, &device_list, entry) {
		if (ndev->device == ib_device) {
			found = true;
			break;
		}
	}
	mutex_unlock(&device_list_mutex);

	if (!found)
		return;

	/*
	 * IB Device that is used by nvmet controllers is being removed,
	 * delete all queues using this device.
	 */
	mutex_lock(&nvmet_rdma_queue_mutex);
	list_for_each_entry_safe(queue, tmp, &nvmet_rdma_queue_list,
				 queue_list) {
		if (queue->dev->device != ib_device)
			continue;

		pr_info("Removing queue %d\n", queue->idx);
		list_del_init(&queue->queue_list);
		__nvmet_rdma_queue_disconnect(queue);
	}
	mutex_unlock(&nvmet_rdma_queue_mutex);

	flush_scheduled_work();
}

static struct ib_client nvmet_rdma_ib_client = {
	.name	= "nvmet_rdma",
	.remove	= nvmet_rdma_remove_one
};

static int __init nvmet_rdma_init(void)
{
	int ret;

	ret = ib_register_client(&nvmet_rdma_ib_client);
	if (ret)
		return ret;

	ret = nvmet_register_transport(&nvmet_rdma_ops);
	if (ret)
		goto err_ib_client;

	return 0;

err_ib_client:
	ib_unregister_client(&nvmet_rdma_ib_client);
	return ret;
}

static void __exit nvmet_rdma_exit(void)
{
	nvmet_unregister_transport(&nvmet_rdma_ops);
	ib_unregister_client(&nvmet_rdma_ib_client);
	WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
	ida_destroy(&nvmet_rdma_queue_ida);
}

module_init(nvmet_rdma_init);
module_exit(nvmet_rdma_exit);

MODULE_LICENSE("GPL v2");
MODULE_ALIAS("nvmet-transport-1"); /* 1 == NVMF_TRTYPE_RDMA */