/*
 * Copyright (c) 2004, 2005, 2006 Voltaire, Inc. All rights reserved.
 * Copyright (c) 2005, 2006 Cisco Systems. All rights reserved.
 * Copyright (c) 2013-2014 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses. You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *	- Redistributions of source code must retain the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer.
 *
 *	- Redistributions in binary form must reproduce the above
 *	  copyright notice, this list of conditions and the following
 *	  disclaimer in the documentation and/or other materials
 *	  provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/delay.h>

#include "iscsi_iser.h"

#define ISCSI_ISER_MAX_CONN	8
#define ISER_MAX_RX_CQ_LEN	(ISER_QP_MAX_RECV_DTOS * ISCSI_ISER_MAX_CONN)
#define ISER_MAX_TX_CQ_LEN	(ISER_QP_MAX_REQ_DTOS * ISCSI_ISER_MAX_CONN)

static void iser_cq_tasklet_fn(unsigned long data);
static void iser_cq_callback(struct ib_cq *cq, void *cq_context);

static void iser_cq_event_callback(struct ib_event *cause, void *context)
{
	iser_err("got cq event %d\n", cause->event);
}

static void iser_qp_event_callback(struct ib_event *cause, void *context)
{
	iser_err("got qp event %d\n", cause->event);
}

static void iser_event_handler(struct ib_event_handler *handler,
			       struct ib_event *event)
{
	iser_err("async event %d on device %s port %d\n", event->event,
		 event->device->name, event->element.port_num);
}
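
/*
 * Device-wide resources: a single PD and DMA MR are shared by all
 * connections on the adapter, plus one TX/RX CQ pair per completion
 * vector (up to ISER_MAX_CQ). Each CQ is sized for the worst case of
 * ISCSI_ISER_MAX_CONN connections posting their full DTO quota.
 */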

/**
 * iser_create_device_ib_res - creates Protection Domain (PD), Completion
 * Queue (CQ), DMA Memory Region (DMA MR) with the device associated with
 * the adapter.
 *
 * returns 0 on success, -1 on failure
 */
static int iser_create_device_ib_res(struct iser_device *device)
{
	struct iser_cq_desc *cq_desc;
	struct ib_device_attr *dev_attr = &device->dev_attr;
	int ret, i;

	ret = ib_query_device(device->ib_device, dev_attr);
	if (ret) {
		pr_warn("Query device failed for %s\n", device->ib_device->name);
		return ret;
	}

	/* Assign function handles - based on FMR support */
	if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr &&
	    device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) {
		iser_info("FMR supported, using FMR for registration\n");
		device->iser_alloc_rdma_reg_res = iser_create_fmr_pool;
		device->iser_free_rdma_reg_res = iser_free_fmr_pool;
		device->iser_reg_rdma_mem = iser_reg_rdma_mem_fmr;
		device->iser_unreg_rdma_mem = iser_unreg_mem_fmr;
	} else if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
		iser_info("FastReg supported, using FastReg for registration\n");
		device->iser_alloc_rdma_reg_res = iser_create_fastreg_pool;
		device->iser_free_rdma_reg_res = iser_free_fastreg_pool;
		device->iser_reg_rdma_mem = iser_reg_rdma_mem_fastreg;
		device->iser_unreg_rdma_mem = iser_unreg_mem_fastreg;
	} else {
		iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n");
		return -1;
	}

	device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors);
	iser_info("using %d CQs, device %s supports %d vectors\n",
		  device->cqs_used, device->ib_device->name,
		  device->ib_device->num_comp_vectors);

	device->cq_desc = kmalloc(sizeof(struct iser_cq_desc) * device->cqs_used,
				  GFP_KERNEL);
	if (device->cq_desc == NULL)
		goto cq_desc_err;
	cq_desc = device->cq_desc;

	device->pd = ib_alloc_pd(device->ib_device);
	if (IS_ERR(device->pd))
		goto pd_err;

	for (i = 0; i < device->cqs_used; i++) {
		cq_desc[i].device = device;
		cq_desc[i].cq_index = i;

		device->rx_cq[i] = ib_create_cq(device->ib_device,
						iser_cq_callback,
						iser_cq_event_callback,
						(void *)&cq_desc[i],
						ISER_MAX_RX_CQ_LEN, i);
		if (IS_ERR(device->rx_cq[i])) {
			device->rx_cq[i] = NULL;
			goto cq_err;
		}

		/* the TX CQ is polled from the tasklet, so it needs no
		 * completion callback of its own */
		device->tx_cq[i] = ib_create_cq(device->ib_device,
						NULL, iser_cq_event_callback,
						(void *)&cq_desc[i],
						ISER_MAX_TX_CQ_LEN, i);

		if (IS_ERR(device->tx_cq[i])) {
			device->tx_cq[i] = NULL;
			goto cq_err;
		}

		if (ib_req_notify_cq(device->rx_cq[i], IB_CQ_NEXT_COMP))
			goto cq_err;

		tasklet_init(&device->cq_tasklet[i],
			     iser_cq_tasklet_fn,
			     (unsigned long)&cq_desc[i]);
	}

	device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE |
				   IB_ACCESS_REMOTE_WRITE |
				   IB_ACCESS_REMOTE_READ);
	if (IS_ERR(device->mr))
		goto dma_mr_err;

	INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device,
			      iser_event_handler);
	if (ib_register_event_handler(&device->event_handler))
		goto handler_err;

	return 0;

handler_err:
	ib_dereg_mr(device->mr);
dma_mr_err:
	for (i = 0; i < device->cqs_used; i++)
		tasklet_kill(&device->cq_tasklet[i]);
cq_err:
	/* the iser_device is zero-initialized at allocation, so CQ slots
	 * that were never created are NULL and safely skipped here */
	for (i = 0; i < device->cqs_used; i++) {
		if (device->tx_cq[i])
			ib_destroy_cq(device->tx_cq[i]);
		if (device->rx_cq[i])
			ib_destroy_cq(device->rx_cq[i]);
	}
	ib_dealloc_pd(device->pd);
pd_err:
	kfree(device->cq_desc);
cq_desc_err:
	iser_err("failed to allocate an IB resource\n");
	return -1;
}
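
/*
 * Teardown below mirrors iser_create_device_ib_res() in reverse order;
 * the tasklets are killed first so no CQ polling can race with the CQ
 * destroys.
 */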

/**
 * iser_free_device_ib_res - destroy/dealloc/dereg the DMA MR,
 * CQ and PD created with the device associated with the adapter.
 */
static void iser_free_device_ib_res(struct iser_device *device)
{
	int i;
	BUG_ON(device->mr == NULL);

	for (i = 0; i < device->cqs_used; i++) {
		tasklet_kill(&device->cq_tasklet[i]);
		(void)ib_destroy_cq(device->tx_cq[i]);
		(void)ib_destroy_cq(device->rx_cq[i]);
		device->tx_cq[i] = NULL;
		device->rx_cq[i] = NULL;
	}

	(void)ib_unregister_event_handler(&device->event_handler);
	(void)ib_dereg_mr(device->mr);
	(void)ib_dealloc_pd(device->pd);

	kfree(device->cq_desc);

	device->mr = NULL;
	device->pd = NULL;
}

/**
 * iser_create_fmr_pool - Creates FMR pool and page_vec
 *
 * returns 0 on success, or errno code on failure
 */
int iser_create_fmr_pool(struct iser_conn *ib_conn, unsigned cmds_max)
{
	struct iser_device *device = ib_conn->device;
	struct ib_fmr_pool_param params;
	int ret = -ENOMEM;

	ib_conn->fmr.page_vec = kmalloc(sizeof(*ib_conn->fmr.page_vec) +
					(sizeof(u64) * (ISCSI_ISER_SG_TABLESIZE + 1)),
					GFP_KERNEL);
	if (!ib_conn->fmr.page_vec)
		return ret;

	ib_conn->fmr.page_vec->pages = (u64 *)(ib_conn->fmr.page_vec + 1);

	params.page_shift = SHIFT_4K;
	/* when the first/last SG element are not start/end *
	 * page aligned, the map would be of N+1 pages      */
	params.max_pages_per_fmr = ISCSI_ISER_SG_TABLESIZE + 1;
	/* make the pool size twice the max number of SCSI commands *
	 * the ML is expected to queue, watermark for unmap at 50%  */
	params.pool_size = cmds_max * 2;
	params.dirty_watermark = cmds_max;
	params.cache = 0;
	params.flush_function = NULL;
	params.access = (IB_ACCESS_LOCAL_WRITE |
			 IB_ACCESS_REMOTE_WRITE |
			 IB_ACCESS_REMOTE_READ);

	ib_conn->fmr.pool = ib_create_fmr_pool(device->pd, &params);
	if (!IS_ERR(ib_conn->fmr.pool))
		return 0;

	/* no FMR => no need for page_vec */
	kfree(ib_conn->fmr.page_vec);
	ib_conn->fmr.page_vec = NULL;

	ret = PTR_ERR(ib_conn->fmr.pool);
	ib_conn->fmr.pool = NULL;
	if (ret != -ENOSYS) {
		iser_err("FMR allocation failed, err %d\n", ret);
		return ret;
	} else {
		iser_warn("FMRs are not supported, using unaligned mode\n");
		return 0;
	}
}

/**
 * iser_free_fmr_pool - releases the FMR pool and page vec
 */
void iser_free_fmr_pool(struct iser_conn *ib_conn)
{
	iser_info("freeing conn %p fmr pool %p\n",
		  ib_conn, ib_conn->fmr.pool);

	if (ib_conn->fmr.pool != NULL)
		ib_destroy_fmr_pool(ib_conn->fmr.pool);

	ib_conn->fmr.pool = NULL;

	kfree(ib_conn->fmr.page_vec);
	ib_conn->fmr.page_vec = NULL;
}

static int
iser_create_fastreg_desc(struct ib_device *ib_device, struct ib_pd *pd,
			 bool pi_enable, struct fast_reg_descriptor *desc)
{
	int ret;

	desc->data_frpl = ib_alloc_fast_reg_page_list(ib_device,
						      ISCSI_ISER_SG_TABLESIZE + 1);
	if (IS_ERR(desc->data_frpl)) {
		ret = PTR_ERR(desc->data_frpl);
		iser_err("Failed to allocate ib_fast_reg_page_list err=%d\n",
			 ret);
		return ret;
	}

	desc->data_mr = ib_alloc_fast_reg_mr(pd, ISCSI_ISER_SG_TABLESIZE + 1);
	if (IS_ERR(desc->data_mr)) {
		ret = PTR_ERR(desc->data_mr);
		iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret);
		goto fast_reg_mr_failure;
	}
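
	/* the data MR was just allocated; mark its key as valid so the
	 * registration path knows no invalidation is pending on it */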
	desc->reg_indicators |= ISER_DATA_KEY_VALID;

	if (pi_enable) {
		struct ib_mr_init_attr mr_init_attr = {0};
		struct iser_pi_context *pi_ctx = NULL;

		desc->pi_ctx = kzalloc(sizeof(*desc->pi_ctx), GFP_KERNEL);
		if (!desc->pi_ctx) {
			iser_err("Failed to allocate pi context\n");
			ret = -ENOMEM;
			goto pi_ctx_alloc_failure;
		}
		pi_ctx = desc->pi_ctx;

		pi_ctx->prot_frpl = ib_alloc_fast_reg_page_list(ib_device,
						ISCSI_ISER_SG_TABLESIZE);
		if (IS_ERR(pi_ctx->prot_frpl)) {
			ret = PTR_ERR(pi_ctx->prot_frpl);
			iser_err("Failed to allocate prot frpl ret=%d\n",
				 ret);
			goto prot_frpl_failure;
		}

		pi_ctx->prot_mr = ib_alloc_fast_reg_mr(pd,
						ISCSI_ISER_SG_TABLESIZE + 1);
		if (IS_ERR(pi_ctx->prot_mr)) {
			ret = PTR_ERR(pi_ctx->prot_mr);
			iser_err("Failed to allocate prot frmr ret=%d\n",
				 ret);
			goto prot_mr_failure;
		}
		desc->reg_indicators |= ISER_PROT_KEY_VALID;

		mr_init_attr.max_reg_descriptors = 2;
		mr_init_attr.flags |= IB_MR_SIGNATURE_EN;
		pi_ctx->sig_mr = ib_create_mr(pd, &mr_init_attr);
		if (IS_ERR(pi_ctx->sig_mr)) {
			ret = PTR_ERR(pi_ctx->sig_mr);
			iser_err("Failed to allocate signature enabled mr err=%d\n",
				 ret);
			goto sig_mr_failure;
		}
		desc->reg_indicators |= ISER_SIG_KEY_VALID;
	}
	desc->reg_indicators &= ~ISER_FASTREG_PROTECTED;

	iser_dbg("Create fr_desc %p page_list %p\n",
		 desc, desc->data_frpl->page_list);

	return 0;
sig_mr_failure:
	ib_dereg_mr(desc->pi_ctx->prot_mr);
prot_mr_failure:
	ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl);
prot_frpl_failure:
	kfree(desc->pi_ctx);
pi_ctx_alloc_failure:
	ib_dereg_mr(desc->data_mr);
fast_reg_mr_failure:
	ib_free_fast_reg_page_list(desc->data_frpl);

	return ret;
}
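
/*
 * A fast registration descriptor bundles everything one in-flight
 * command needs for registration: a page list and data MR, plus a
 * protection context (prot/sig MRs) when T10-PI is enabled on the
 * connection.
 */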

/**
 * iser_create_fastreg_pool - Creates pool of fast_reg descriptors
 * for fast registration work requests.
 *
 * returns 0 on success, or errno code on failure
 */
int iser_create_fastreg_pool(struct iser_conn *ib_conn, unsigned cmds_max)
{
	struct iser_device *device = ib_conn->device;
	struct fast_reg_descriptor *desc;
	int i, ret;

	INIT_LIST_HEAD(&ib_conn->fastreg.pool);
	ib_conn->fastreg.pool_size = 0;
	for (i = 0; i < cmds_max; i++) {
		desc = kzalloc(sizeof(*desc), GFP_KERNEL);
		if (!desc) {
			iser_err("Failed to allocate a new fast_reg descriptor\n");
			ret = -ENOMEM;
			goto err;
		}

		ret = iser_create_fastreg_desc(device->ib_device, device->pd,
					       ib_conn->pi_support, desc);
		if (ret) {
			iser_err("Failed to create fastreg descriptor err=%d\n",
				 ret);
			kfree(desc);
			goto err;
		}

		list_add_tail(&desc->list, &ib_conn->fastreg.pool);
		ib_conn->fastreg.pool_size++;
	}

	return 0;

err:
	iser_free_fastreg_pool(ib_conn);
	return ret;
}

/**
 * iser_free_fastreg_pool - releases the pool of fast_reg descriptors
 */
void iser_free_fastreg_pool(struct iser_conn *ib_conn)
{
	struct fast_reg_descriptor *desc, *tmp;
	int i = 0;

	if (list_empty(&ib_conn->fastreg.pool))
		return;

	iser_info("freeing conn %p fr pool\n", ib_conn);

	list_for_each_entry_safe(desc, tmp, &ib_conn->fastreg.pool, list) {
		list_del(&desc->list);
		ib_free_fast_reg_page_list(desc->data_frpl);
		ib_dereg_mr(desc->data_mr);
		if (desc->pi_ctx) {
			ib_free_fast_reg_page_list(desc->pi_ctx->prot_frpl);
			ib_dereg_mr(desc->pi_ctx->prot_mr);
			ib_destroy_mr(desc->pi_ctx->sig_mr);
			kfree(desc->pi_ctx);
		}
		kfree(desc);
		++i;
	}

	if (i < ib_conn->fastreg.pool_size)
		iser_warn("pool still has %d regions registered\n",
			  ib_conn->fastreg.pool_size - i);
}

/**
 * iser_create_ib_conn_res - creates the Queue-Pair (QP)
 *
 * returns 0 on success, -1 on failure
 */
static int iser_create_ib_conn_res(struct iser_conn *ib_conn)
{
	struct iser_device *device;
	struct ib_qp_init_attr init_attr;
	int ret = -ENOMEM;
	int index, min_index = 0;

	BUG_ON(ib_conn->device == NULL);

	device = ib_conn->device;

	memset(&init_attr, 0, sizeof init_attr);

	mutex_lock(&ig.connlist_mutex);
	/* select the CQ with the minimal number of usages */
	for (index = 0; index < device->cqs_used; index++)
		if (device->cq_active_qps[index] <
		    device->cq_active_qps[min_index])
			min_index = index;
	device->cq_active_qps[min_index]++;
	mutex_unlock(&ig.connlist_mutex);
	iser_info("cq index %d used for ib_conn %p\n", min_index, ib_conn);

	init_attr.event_handler = iser_qp_event_callback;
	init_attr.qp_context = (void *)ib_conn;
	init_attr.send_cq = device->tx_cq[min_index];
	init_attr.recv_cq = device->rx_cq[min_index];
	init_attr.cap.max_recv_wr = ISER_QP_MAX_RECV_DTOS;
	init_attr.cap.max_send_sge = 2;
	init_attr.cap.max_recv_sge = 1;
	init_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
	init_attr.qp_type = IB_QPT_RC;
	if (ib_conn->pi_support) {
		/* signature work requests need a deeper send queue */
		init_attr.cap.max_send_wr = ISER_QP_SIG_MAX_REQ_DTOS;
		init_attr.create_flags |= IB_QP_CREATE_SIGNATURE_EN;
	} else {
		init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS;
	}

	ret = rdma_create_qp(ib_conn->cma_id, device->pd, &init_attr);
	if (ret)
		goto out_err;

	ib_conn->qp = ib_conn->cma_id->qp;
	iser_info("setting conn %p cma_id %p qp %p\n",
		  ib_conn, ib_conn->cma_id,
		  ib_conn->cma_id->qp);
	return ret;

out_err:
	iser_err("unable to alloc mem or create resource, err %d\n", ret);
	return ret;
}
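
/*
 * cq_active_qps[] balances connections across the completion vectors:
 * it is incremented when the QP is created above and decremented when
 * the QP is destroyed in iser_free_ib_conn_res() below.
 */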

/**
 * releases the QP object
 */
static void iser_free_ib_conn_res(struct iser_conn *ib_conn)
{
	int cq_index;
	BUG_ON(ib_conn == NULL);

	iser_info("freeing conn %p cma_id %p qp %p\n",
		  ib_conn, ib_conn->cma_id,
		  ib_conn->qp);

	/* qp is created only once both addr & route are resolved */

	if (ib_conn->qp != NULL) {
		cq_index = ((struct iser_cq_desc *)ib_conn->qp->recv_cq->cq_context)->cq_index;
		ib_conn->device->cq_active_qps[cq_index]--;

		rdma_destroy_qp(ib_conn->cma_id);
	}

	ib_conn->qp = NULL;
}

/**
 * based on the resolved device node GUID see if there is an already
 * allocated device for this device. If there isn't, create one.
 */
static
struct iser_device *iser_device_find_by_ib_device(struct rdma_cm_id *cma_id)
{
	struct iser_device *device;

	mutex_lock(&ig.device_list_mutex);

	list_for_each_entry(device, &ig.device_list, ig_list)
		/* find if there's a match using the node GUID */
		if (device->ib_device->node_guid == cma_id->device->node_guid)
			goto inc_refcnt;

	device = kzalloc(sizeof *device, GFP_KERNEL);
	if (device == NULL)
		goto out;

	/* assign the ib device to the iser device */
	device->ib_device = cma_id->device;
	/* init the device and link it into ig device list */
	if (iser_create_device_ib_res(device)) {
		kfree(device);
		device = NULL;
		goto out;
	}
	list_add(&device->ig_list, &ig.device_list);

inc_refcnt:
	device->refcount++;
out:
	mutex_unlock(&ig.device_list_mutex);
	return device;
}

/* if there's no demand for this device, release it */
static void iser_device_try_release(struct iser_device *device)
{
	mutex_lock(&ig.device_list_mutex);
	device->refcount--;
	iser_info("device %p refcount %d\n", device, device->refcount);
	if (!device->refcount) {
		iser_free_device_ib_res(device);
		list_del(&device->ig_list);
		kfree(device);
	}
	mutex_unlock(&ig.device_list_mutex);
}

/**
 * Called with state mutex held
 **/
static int iser_conn_state_comp_exch(struct iser_conn *ib_conn,
				     enum iser_ib_conn_state comp,
				     enum iser_ib_conn_state exch)
{
	int ret;

	ret = (ib_conn->state == comp);
	if (ret)
		ib_conn->state = exch;
	return ret;
}

void iser_release_work(struct work_struct *work)
{
	struct iser_conn *ib_conn;
	int rc;

	ib_conn = container_of(work, struct iser_conn, release_work);

	/* wait for .conn_stop callback */
	rc = wait_for_completion_timeout(&ib_conn->stop_completion, 30 * HZ);
	WARN_ON(rc == 0);

	/* wait for the qp's post send and post receive buffers to empty */
	rc = wait_for_completion_timeout(&ib_conn->flush_completion, 30 * HZ);
	WARN_ON(rc == 0);

	mutex_lock(&ib_conn->state_mutex);
	ib_conn->state = ISER_CONN_DOWN;
	mutex_unlock(&ib_conn->state_mutex);

	iser_conn_release(ib_conn);
}
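
/*
 * Connection teardown is two-staged: iser_conn_terminate() asks the CM
 * to disconnect, which moves the QP to error and flushes posted work
 * requests; iser_conn_release() then frees all connection resources.
 */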

/**
 * Frees all conn objects and deallocs conn descriptor
 */
void iser_conn_release(struct iser_conn *ib_conn)
{
	struct iser_device *device = ib_conn->device;

	mutex_lock(&ig.connlist_mutex);
	list_del(&ib_conn->conn_list);
	mutex_unlock(&ig.connlist_mutex);

	mutex_lock(&ib_conn->state_mutex);
	BUG_ON(ib_conn->state != ISER_CONN_DOWN);

	iser_free_rx_descriptors(ib_conn);
	iser_free_ib_conn_res(ib_conn);
	ib_conn->device = NULL;
	/* on EVENT_ADDR_ERROR there's no device yet for this conn */
	if (device != NULL)
		iser_device_try_release(device);
	mutex_unlock(&ib_conn->state_mutex);

	/* if cma handler context, the caller actually destroys the id */
	if (ib_conn->cma_id != NULL) {
		rdma_destroy_id(ib_conn->cma_id);
		ib_conn->cma_id = NULL;
	}
	kfree(ib_conn);
}

/**
 * triggers start of the disconnect procedures and waits for them to be done
 */
void iser_conn_terminate(struct iser_conn *ib_conn)
{
	int err = 0;

	/* change the ib conn state only if the conn is UP, however always call
	 * rdma_disconnect since this is the only way to cause the CMA to change
	 * the QP state to ERROR
	 */
	iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP, ISER_CONN_TERMINATING);
	err = rdma_disconnect(ib_conn->cma_id);
	if (err)
		iser_err("Failed to disconnect, conn: 0x%p err %d\n",
			 ib_conn, err);
}

/**
 * Called with state mutex held
 **/
static void iser_connect_error(struct rdma_cm_id *cma_id)
{
	struct iser_conn *ib_conn;

	ib_conn = (struct iser_conn *)cma_id->context;
	ib_conn->state = ISER_CONN_DOWN;
}

/**
 * Called with state mutex held
 **/
static void iser_addr_handler(struct rdma_cm_id *cma_id)
{
	struct iser_device *device;
	struct iser_conn *ib_conn;
	int ret;

	ib_conn = (struct iser_conn *)cma_id->context;
	if (ib_conn->state != ISER_CONN_PENDING)
		/* bailout */
		return;

	device = iser_device_find_by_ib_device(cma_id);
	if (!device) {
		iser_err("device lookup/creation failed\n");
		iser_connect_error(cma_id);
		return;
	}

	ib_conn->device = device;

	/* connection T10-PI support */
	if (iser_pi_enable) {
		if (!(device->dev_attr.device_cap_flags &
		      IB_DEVICE_SIGNATURE_HANDOVER)) {
			iser_warn("T10-PI requested but not supported on %s, "
				  "continue without T10-PI\n",
				  ib_conn->device->ib_device->name);
			ib_conn->pi_support = false;
		} else {
			ib_conn->pi_support = true;
		}
	}

	ret = rdma_resolve_route(cma_id, 1000);
	if (ret) {
		iser_err("resolve route failed: %d\n", ret);
		iser_connect_error(cma_id);
		return;
	}
}
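
/*
 * Once the route is resolved the QP can be created and the connection
 * requested. The CM REQ carries an iser_cm_hdr advertising what this
 * initiator does not support: zero-based VAs and send-with-invalidate.
 */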

/**
 * Called with state mutex held
 **/
static void iser_route_handler(struct rdma_cm_id *cma_id)
{
	struct rdma_conn_param conn_param;
	int ret;
	struct iser_cm_hdr req_hdr;
	struct iser_conn *ib_conn = (struct iser_conn *)cma_id->context;
	struct iser_device *device = ib_conn->device;

	if (ib_conn->state != ISER_CONN_PENDING)
		/* bailout */
		return;

	ret = iser_create_ib_conn_res((struct iser_conn *)cma_id->context);
	if (ret)
		goto failure;

	memset(&conn_param, 0, sizeof conn_param);
	conn_param.responder_resources = device->dev_attr.max_qp_rd_atom;
	conn_param.initiator_depth = 1;
	conn_param.retry_count = 7;
	conn_param.rnr_retry_count = 6;

	memset(&req_hdr, 0, sizeof(req_hdr));
	req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED |
			 ISER_SEND_W_INV_NOT_SUPPORTED);
	conn_param.private_data = (void *)&req_hdr;
	conn_param.private_data_len = sizeof(struct iser_cm_hdr);

	ret = rdma_connect(cma_id, &conn_param);
	if (ret) {
		iser_err("failure connecting: %d\n", ret);
		goto failure;
	}

	return;
failure:
	iser_connect_error(cma_id);
}

static void iser_connected_handler(struct rdma_cm_id *cma_id)
{
	struct iser_conn *ib_conn;
	struct ib_qp_attr attr;
	struct ib_qp_init_attr init_attr;

	ib_conn = (struct iser_conn *)cma_id->context;
	if (ib_conn->state != ISER_CONN_PENDING)
		/* bailout */
		return;

	(void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr);
	iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, cma_id->qp->qp_num);

	ib_conn->state = ISER_CONN_UP;
	complete(&ib_conn->up_completion);
}

static void iser_disconnected_handler(struct rdma_cm_id *cma_id)
{
	struct iser_conn *ib_conn;

	ib_conn = (struct iser_conn *)cma_id->context;

	/* getting here when the state is UP means that the conn is being *
	 * terminated asynchronously from the iSCSI layer's perspective.  */
	if (iser_conn_state_comp_exch(ib_conn, ISER_CONN_UP,
				      ISER_CONN_TERMINATING)) {
		if (ib_conn->iscsi_conn)
			iscsi_conn_failure(ib_conn->iscsi_conn, ISCSI_ERR_CONN_FAILED);
		else
			iser_err("iscsi_iser connection isn't bound\n");
	}

	/* Complete the termination process if no posts are pending. This code
	 * block also exists in iser_handle_comp_error(), but it is needed here
	 * for cases of no flushes at all, e.g. discovery over rdma.
	 */
	if (ib_conn->post_recv_buf_count == 0 &&
	    (atomic_read(&ib_conn->post_send_buf_count) == 0)) {
		complete(&ib_conn->flush_completion);
	}
}

static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *event)
{
	struct iser_conn *ib_conn;

	ib_conn = (struct iser_conn *)cma_id->context;
	iser_info("event %d status %d conn %p id %p\n",
		  event->event, event->status, cma_id->context, cma_id);

	mutex_lock(&ib_conn->state_mutex);
	switch (event->event) {
	case RDMA_CM_EVENT_ADDR_RESOLVED:
		iser_addr_handler(cma_id);
		break;
	case RDMA_CM_EVENT_ROUTE_RESOLVED:
		iser_route_handler(cma_id);
		break;
	case RDMA_CM_EVENT_ESTABLISHED:
		iser_connected_handler(cma_id);
		break;
	case RDMA_CM_EVENT_ADDR_ERROR:
	case RDMA_CM_EVENT_ROUTE_ERROR:
	case RDMA_CM_EVENT_CONNECT_ERROR:
	case RDMA_CM_EVENT_UNREACHABLE:
	case RDMA_CM_EVENT_REJECTED:
		iser_connect_error(cma_id);
		break;
	case RDMA_CM_EVENT_DISCONNECTED:
	case RDMA_CM_EVENT_DEVICE_REMOVAL:
	case RDMA_CM_EVENT_ADDR_CHANGE:
	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
		iser_disconnected_handler(cma_id);
		break;
	default:
		iser_err("Unexpected RDMA CM event (%d)\n", event->event);
		break;
	}
	mutex_unlock(&ib_conn->state_mutex);
	return 0;
}

void iser_conn_init(struct iser_conn *ib_conn)
{
	ib_conn->state = ISER_CONN_INIT;
	ib_conn->post_recv_buf_count = 0;
	atomic_set(&ib_conn->post_send_buf_count, 0);
	init_completion(&ib_conn->stop_completion);
	init_completion(&ib_conn->flush_completion);
	init_completion(&ib_conn->up_completion);
	INIT_LIST_HEAD(&ib_conn->conn_list);
	spin_lock_init(&ib_conn->lock);
	mutex_init(&ib_conn->state_mutex);
}
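
/*
 * A freshly initialized connection starts in ISER_CONN_INIT and moves
 * to PENDING in iser_connect(); the CM event handler then drives it to
 * UP on establishment or DOWN on any error.
 */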

/**
 * starts the process of connecting to the target
 * sleeps until the connection is established or rejected
 */
int iser_connect(struct iser_conn *ib_conn,
		 struct sockaddr *src_addr,
		 struct sockaddr *dst_addr,
		 int non_blocking)
{
	int err = 0;

	mutex_lock(&ib_conn->state_mutex);

	sprintf(ib_conn->name, "%pISp", dst_addr);

	iser_info("connecting to: %s\n", ib_conn->name);

	/* the device is known only --after-- address resolution */
	ib_conn->device = NULL;

	ib_conn->state = ISER_CONN_PENDING;

	ib_conn->cma_id = rdma_create_id(iser_cma_handler,
					 (void *)ib_conn,
					 RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(ib_conn->cma_id)) {
		err = PTR_ERR(ib_conn->cma_id);
		iser_err("rdma_create_id failed: %d\n", err);
		goto id_failure;
	}

	err = rdma_resolve_addr(ib_conn->cma_id, src_addr, dst_addr, 1000);
	if (err) {
		iser_err("rdma_resolve_addr failed: %d\n", err);
		goto addr_failure;
	}

	if (!non_blocking) {
		wait_for_completion_interruptible(&ib_conn->up_completion);

		if (ib_conn->state != ISER_CONN_UP) {
			err = -EIO;
			goto connect_failure;
		}
	}
	mutex_unlock(&ib_conn->state_mutex);

	mutex_lock(&ig.connlist_mutex);
	list_add(&ib_conn->conn_list, &ig.connlist);
	mutex_unlock(&ig.connlist_mutex);
	return 0;

id_failure:
	ib_conn->cma_id = NULL;
addr_failure:
	ib_conn->state = ISER_CONN_DOWN;
connect_failure:
	mutex_unlock(&ib_conn->state_mutex);
	iser_conn_release(ib_conn);
	return err;
}

/**
 * iser_reg_page_vec - Register physical memory
 *
 * returns: 0 on success, errno code on failure
 */
int iser_reg_page_vec(struct iser_conn *ib_conn,
		      struct iser_page_vec *page_vec,
		      struct iser_mem_reg *mem_reg)
{
	struct ib_pool_fmr *mem;
	u64 io_addr;
	u64 *page_list;
	int status;

	page_list = page_vec->pages;
	io_addr = page_list[0];

	mem = ib_fmr_pool_map_phys(ib_conn->fmr.pool,
				   page_list,
				   page_vec->length,
				   io_addr);

	if (IS_ERR(mem)) {
		status = (int)PTR_ERR(mem);
		iser_err("ib_fmr_pool_map_phys failed: %d\n", status);
		return status;
	}

	mem_reg->lkey = mem->fmr->lkey;
	mem_reg->rkey = mem->fmr->rkey;
	mem_reg->len = page_vec->length * SIZE_4K;
	mem_reg->va = io_addr;
	mem_reg->is_mr = 1;
	mem_reg->mem_h = (void *)mem;

	mem_reg->va += page_vec->offset;
	mem_reg->len = page_vec->data_size;

	iser_dbg("PHYSICAL Mem.register, [PHYS p_array: 0x%p, sz: %d, "
		 "entry[0]: (0x%08lx,%ld)] -> "
		 "[lkey: 0x%08X mem_h: 0x%p va: 0x%08lX sz: %ld]\n",
		 page_vec, page_vec->length,
		 (unsigned long)page_vec->pages[0],
		 (unsigned long)page_vec->data_size,
		 (unsigned int)mem_reg->lkey, mem_reg->mem_h,
		 (unsigned long)mem_reg->va, (unsigned long)mem_reg->len);
	return 0;
}
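
/*
 * Note that ib_fmr_pool_unmap() below does not unmap synchronously: it
 * returns the FMR to the pool, and the pool flushes dirty FMRs in
 * batches once the dirty_watermark set at pool creation is crossed.
 */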

/**
 * Unregister (previously registered using FMR) memory.
 * If the memory is not FMR-registered, this does nothing.
 */
void iser_unreg_mem_fmr(struct iscsi_iser_task *iser_task,
			enum iser_data_dir cmd_dir)
{
	struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
	int ret;

	if (!reg->is_mr)
		return;

	iser_dbg("PHYSICAL Mem.Unregister mem_h %p\n", reg->mem_h);

	ret = ib_fmr_pool_unmap((struct ib_pool_fmr *)reg->mem_h);
	if (ret)
		iser_err("ib_fmr_pool_unmap failed %d\n", ret);

	reg->mem_h = NULL;
}

void iser_unreg_mem_fastreg(struct iscsi_iser_task *iser_task,
			    enum iser_data_dir cmd_dir)
{
	struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
	struct iser_conn *ib_conn = iser_task->ib_conn;
	struct fast_reg_descriptor *desc = reg->mem_h;

	if (!reg->is_mr)
		return;

	reg->mem_h = NULL;
	reg->is_mr = 0;
	spin_lock_bh(&ib_conn->lock);
	list_add_tail(&desc->list, &ib_conn->fastreg.pool);
	spin_unlock_bh(&ib_conn->lock);
}

int iser_post_recvl(struct iser_conn *ib_conn)
{
	struct ib_recv_wr rx_wr, *rx_wr_failed;
	struct ib_sge sge;
	int ib_ret;

	sge.addr = ib_conn->login_resp_dma;
	sge.length = ISER_RX_LOGIN_SIZE;
	sge.lkey = ib_conn->device->mr->lkey;

	rx_wr.wr_id = (unsigned long)ib_conn->login_resp_buf;
	rx_wr.sg_list = &sge;
	rx_wr.num_sge = 1;
	rx_wr.next = NULL;

	ib_conn->post_recv_buf_count++;
	ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed);
	if (ib_ret) {
		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
		ib_conn->post_recv_buf_count--;
	}
	return ib_ret;
}

int iser_post_recvm(struct iser_conn *ib_conn, int count)
{
	struct ib_recv_wr *rx_wr, *rx_wr_failed;
	int i, ib_ret;
	unsigned int my_rx_head = ib_conn->rx_desc_head;
	struct iser_rx_desc *rx_desc;

	for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) {
		rx_desc = &ib_conn->rx_descs[my_rx_head];
		rx_wr->wr_id = (unsigned long)rx_desc;
		rx_wr->sg_list = &rx_desc->rx_sg;
		rx_wr->num_sge = 1;
		rx_wr->next = rx_wr + 1;
		/* the rx ring size is a power of two, so advancing the
		 * head is a simple mask operation */
		my_rx_head = (my_rx_head + 1) & ib_conn->qp_max_recv_dtos_mask;
	}

	rx_wr--;
	rx_wr->next = NULL; /* mark end of work requests list */

	ib_conn->post_recv_buf_count += count;
	ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed);
	if (ib_ret) {
		iser_err("ib_post_recv failed ret=%d\n", ib_ret);
		ib_conn->post_recv_buf_count -= count;
	} else
		ib_conn->rx_desc_head = my_rx_head;
	return ib_ret;
}

/**
 * iser_post_send - Initiate a Send DTO operation
 *
 * returns 0 on success, -1 on failure
 */
int iser_post_send(struct iser_conn *ib_conn, struct iser_tx_desc *tx_desc)
{
	int ib_ret;
	struct ib_send_wr send_wr, *send_wr_failed;

	ib_dma_sync_single_for_device(ib_conn->device->ib_device,
				      tx_desc->dma_addr, ISER_HEADERS_LEN,
				      DMA_TO_DEVICE);

	send_wr.next = NULL;
	send_wr.wr_id = (unsigned long)tx_desc;
	send_wr.sg_list = tx_desc->tx_sg;
	send_wr.num_sge = tx_desc->num_sge;
	send_wr.opcode = IB_WR_SEND;
	send_wr.send_flags = IB_SEND_SIGNALED;

	atomic_inc(&ib_conn->post_send_buf_count);

	ib_ret = ib_post_send(ib_conn->qp, &send_wr, &send_wr_failed);
	if (ib_ret) {
		iser_err("ib_post_send failed, ret:%d\n", ib_ret);
		atomic_dec(&ib_conn->post_send_buf_count);
	}
	return ib_ret;
}
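
/*
 * Completion processing model: the RX CQ interrupt schedules a
 * per-vector tasklet, which first drains the TX CQ and then polls RX
 * completions, re-arming the RX CQ only once it is empty.
 */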

static void
iser_handle_comp_error(struct iser_tx_desc *desc,
		       struct iser_conn *ib_conn)
{
	if (desc && desc->type == ISCSI_TX_DATAOUT)
		kmem_cache_free(ig.desc_cache, desc);

	if (ib_conn->post_recv_buf_count == 0 &&
	    atomic_read(&ib_conn->post_send_buf_count) == 0) {
		/**
		 * getting here when the state is UP means that the conn is
		 * being terminated asynchronously from the iSCSI layer's
		 * perspective. It is safe to peek at the connection state
		 * since iscsi_conn_failure is allowed to be called twice.
		 **/
		if (ib_conn->state == ISER_CONN_UP)
			iscsi_conn_failure(ib_conn->iscsi_conn,
					   ISCSI_ERR_CONN_FAILED);

		/* no more non-completed posts to the QP, complete the
		 * termination process without worrying about the
		 * disconnect event */
		complete(&ib_conn->flush_completion);
	}
}

static int iser_drain_tx_cq(struct iser_device *device, int cq_index)
{
	struct ib_cq *cq = device->tx_cq[cq_index];
	struct ib_wc wc;
	struct iser_tx_desc *tx_desc;
	struct iser_conn *ib_conn;
	int completed_tx = 0;

	while (ib_poll_cq(cq, 1, &wc) == 1) {
		tx_desc = (struct iser_tx_desc *)(unsigned long)wc.wr_id;
		ib_conn = wc.qp->qp_context;
		if (wc.status == IB_WC_SUCCESS) {
			if (wc.opcode == IB_WC_SEND)
				iser_snd_completion(tx_desc, ib_conn);
			else
				iser_err("expected opcode %d got %d\n",
					 IB_WC_SEND, wc.opcode);
		} else {
			iser_err("tx id %llx status %d vend_err %x\n",
				 wc.wr_id, wc.status, wc.vendor_err);
			if (wc.wr_id != ISER_FASTREG_LI_WRID) {
				atomic_dec(&ib_conn->post_send_buf_count);
				iser_handle_comp_error(tx_desc, ib_conn);
			}
		}
		completed_tx++;
	}
	return completed_tx;
}

static void iser_cq_tasklet_fn(unsigned long data)
{
	struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)data;
	struct iser_device *device = cq_desc->device;
	int cq_index = cq_desc->cq_index;
	struct ib_cq *cq = device->rx_cq[cq_index];
	struct ib_wc wc;
	struct iser_rx_desc *desc;
	unsigned long xfer_len;
	struct iser_conn *ib_conn;
	int completed_tx, completed_rx = 0;

	/* First drain the TX CQ, so that in a case where we have RX flushes
	 * and a successful TX completion we will still go through the
	 * completion error handling.
	 */
	completed_tx = iser_drain_tx_cq(device, cq_index);

	while (ib_poll_cq(cq, 1, &wc) == 1) {
		desc = (struct iser_rx_desc *)(unsigned long)wc.wr_id;
		BUG_ON(desc == NULL);
		ib_conn = wc.qp->qp_context;
		if (wc.status == IB_WC_SUCCESS) {
			if (wc.opcode == IB_WC_RECV) {
				xfer_len = (unsigned long)wc.byte_len;
				iser_rcv_completion(desc, xfer_len, ib_conn);
			} else
				iser_err("expected opcode %d got %d\n",
					 IB_WC_RECV, wc.opcode);
		} else {
			if (wc.status != IB_WC_WR_FLUSH_ERR)
				iser_err("rx id %llx status %d vend_err %x\n",
					 wc.wr_id, wc.status, wc.vendor_err);
			ib_conn->post_recv_buf_count--;
			iser_handle_comp_error(NULL, ib_conn);
		}
		completed_rx++;
		/* avoid starving the TX CQ during long RX bursts: re-drain
		 * it every 64 RX completions */
		if (!(completed_rx & 63))
			completed_tx += iser_drain_tx_cq(device, cq_index);
	}
	/* it is assumed here that arming the CQ only once it is empty
	 * would not cause interrupts to be missed */
	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);

	iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx);
}

static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
{
	struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)cq_context;
	struct iser_device *device = cq_desc->device;
	int cq_index = cq_desc->cq_index;

	tasklet_schedule(&device->cq_tasklet[cq_index]);
}

u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task,
			     enum iser_data_dir cmd_dir, sector_t *sector)
{
	struct iser_mem_reg *reg = &iser_task->rdma_regd[cmd_dir].reg;
	struct fast_reg_descriptor *desc = reg->mem_h;
	unsigned long sector_size = iser_task->sc->device->sector_size;
	struct ib_mr_status mr_status;
	int ret;

	if (desc && desc->reg_indicators & ISER_FASTREG_PROTECTED) {
		desc->reg_indicators &= ~ISER_FASTREG_PROTECTED;
		ret = ib_check_mr_status(desc->pi_ctx->sig_mr,
					 IB_MR_CHECK_SIG_STATUS, &mr_status);
		if (ret) {
			pr_err("ib_check_mr_status failed, ret %d\n", ret);
			goto err;
		}

		if (mr_status.fail_status & IB_MR_CHECK_SIG_STATUS) {
			sector_t sector_off = mr_status.sig_err.sig_err_offset;

			/* in the protection domain each sector carries an
			 * extra 8-byte DIF tuple */
			do_div(sector_off, sector_size + 8);
			*sector = scsi_get_lba(iser_task->sc) + sector_off;

			pr_err("PI error found type %d at sector %llx "
			       "expected %x vs actual %x\n",
			       mr_status.sig_err.err_type,
			       (unsigned long long)*sector,
			       mr_status.sig_err.expected,
			       mr_status.sig_err.actual);

			switch (mr_status.sig_err.err_type) {
			case IB_SIG_BAD_GUARD:
				return 0x1;
			case IB_SIG_BAD_REFTAG:
				return 0x3;
			case IB_SIG_BAD_APPTAG:
				return 0x2;
			}
		}
	}

	return 0;
err:
	/* Not a lot we can do here, return ambiguous guard error */
	return 0x1;
}