1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause 2 3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */ 4 /* Copyright (c) 2008-2019, IBM Corporation */ 5 6 #include <linux/errno.h> 7 #include <linux/types.h> 8 #include <linux/uaccess.h> 9 #include <linux/vmalloc.h> 10 #include <linux/xarray.h> 11 #include <net/addrconf.h> 12 13 #include <rdma/iw_cm.h> 14 #include <rdma/ib_verbs.h> 15 #include <rdma/ib_user_verbs.h> 16 #include <rdma/uverbs_ioctl.h> 17 18 #include "siw.h" 19 #include "siw_verbs.h" 20 #include "siw_mem.h" 21 22 static int siw_qp_state_to_ib_qp_state[SIW_QP_STATE_COUNT] = { 23 [SIW_QP_STATE_IDLE] = IB_QPS_INIT, 24 [SIW_QP_STATE_RTR] = IB_QPS_RTR, 25 [SIW_QP_STATE_RTS] = IB_QPS_RTS, 26 [SIW_QP_STATE_CLOSING] = IB_QPS_SQD, 27 [SIW_QP_STATE_TERMINATE] = IB_QPS_SQE, 28 [SIW_QP_STATE_ERROR] = IB_QPS_ERR 29 }; 30 31 static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = { 32 [IB_QPS_RESET] = SIW_QP_STATE_IDLE, 33 [IB_QPS_INIT] = SIW_QP_STATE_IDLE, 34 [IB_QPS_RTR] = SIW_QP_STATE_RTR, 35 [IB_QPS_RTS] = SIW_QP_STATE_RTS, 36 [IB_QPS_SQD] = SIW_QP_STATE_CLOSING, 37 [IB_QPS_SQE] = SIW_QP_STATE_TERMINATE, 38 [IB_QPS_ERR] = SIW_QP_STATE_ERROR 39 }; 40 41 static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = { 42 [IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR", 43 [IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE", 44 [IB_QPS_ERR] = "ERR" 45 }; 46 47 void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry) 48 { 49 struct siw_user_mmap_entry *entry = to_siw_mmap_entry(rdma_entry); 50 51 kfree(entry); 52 } 53 54 int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma) 55 { 56 struct siw_ucontext *uctx = to_siw_ctx(ctx); 57 size_t size = vma->vm_end - vma->vm_start; 58 struct rdma_user_mmap_entry *rdma_entry; 59 struct siw_user_mmap_entry *entry; 60 int rv = -EINVAL; 61 62 /* 63 * Must be page aligned 64 */ 65 if (vma->vm_start & (PAGE_SIZE - 1)) { 66 pr_warn("siw: mmap not page aligned\n"); 67 return -EINVAL; 68 } 69 rdma_entry = rdma_user_mmap_entry_get(&uctx->base_ucontext, vma); 70 if (!rdma_entry) { 71 siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %#zx\n", 72 vma->vm_pgoff, size); 73 return -EINVAL; 74 } 75 entry = to_siw_mmap_entry(rdma_entry); 76 77 rv = remap_vmalloc_range(vma, entry->address, 0); 78 if (rv) 79 pr_warn("remap_vmalloc_range failed: %lu, %zu\n", vma->vm_pgoff, 80 size); 81 rdma_user_mmap_entry_put(rdma_entry); 82 83 return rv; 84 } 85 86 int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata) 87 { 88 struct siw_device *sdev = to_siw_dev(base_ctx->device); 89 struct siw_ucontext *ctx = to_siw_ctx(base_ctx); 90 struct siw_uresp_alloc_ctx uresp = {}; 91 int rv; 92 93 if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) { 94 rv = -ENOMEM; 95 goto err_out; 96 } 97 ctx->sdev = sdev; 98 99 uresp.dev_id = sdev->vendor_part_id; 100 101 if (udata->outlen < sizeof(uresp)) { 102 rv = -EINVAL; 103 goto err_out; 104 } 105 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 106 if (rv) 107 goto err_out; 108 109 siw_dbg(base_ctx->device, "success. now %d context(s)\n", 110 atomic_read(&sdev->num_ctx)); 111 112 return 0; 113 114 err_out: 115 atomic_dec(&sdev->num_ctx); 116 siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv, 117 atomic_read(&sdev->num_ctx)); 118 119 return rv; 120 } 121 122 void siw_dealloc_ucontext(struct ib_ucontext *base_ctx) 123 { 124 struct siw_ucontext *uctx = to_siw_ctx(base_ctx); 125 126 atomic_dec(&uctx->sdev->num_ctx); 127 } 128 129 int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr, 130 struct ib_udata *udata) 131 { 132 struct siw_device *sdev = to_siw_dev(base_dev); 133 134 if (udata->inlen || udata->outlen) 135 return -EINVAL; 136 137 memset(attr, 0, sizeof(*attr)); 138 139 /* Revisit atomic caps if RFC 7306 gets supported */ 140 attr->atomic_cap = 0; 141 attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS; 142 attr->kernel_cap_flags = IBK_ALLOW_USER_UNREG; 143 attr->max_cq = sdev->attrs.max_cq; 144 attr->max_cqe = sdev->attrs.max_cqe; 145 attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL; 146 attr->max_mr = sdev->attrs.max_mr; 147 attr->max_mw = sdev->attrs.max_mw; 148 attr->max_mr_size = ~0ull; 149 attr->max_pd = sdev->attrs.max_pd; 150 attr->max_qp = sdev->attrs.max_qp; 151 attr->max_qp_init_rd_atom = sdev->attrs.max_ird; 152 attr->max_qp_rd_atom = sdev->attrs.max_ord; 153 attr->max_qp_wr = sdev->attrs.max_qp_wr; 154 attr->max_recv_sge = sdev->attrs.max_sge; 155 attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird; 156 attr->max_send_sge = sdev->attrs.max_sge; 157 attr->max_sge_rd = sdev->attrs.max_sge_rd; 158 attr->max_srq = sdev->attrs.max_srq; 159 attr->max_srq_sge = sdev->attrs.max_srq_sge; 160 attr->max_srq_wr = sdev->attrs.max_srq_wr; 161 attr->page_size_cap = PAGE_SIZE; 162 attr->vendor_id = SIW_VENDOR_ID; 163 attr->vendor_part_id = sdev->vendor_part_id; 164 165 addrconf_addr_eui48((u8 *)&attr->sys_image_guid, 166 sdev->raw_gid); 167 168 return 0; 169 } 170 171 int siw_query_port(struct ib_device *base_dev, u32 port, 172 struct ib_port_attr *attr) 173 { 174 struct siw_device *sdev = to_siw_dev(base_dev); 175 int rv; 176 177 memset(attr, 0, sizeof(*attr)); 178 179 rv = ib_get_eth_speed(base_dev, port, &attr->active_speed, 180 &attr->active_width); 181 attr->gid_tbl_len = 1; 182 attr->max_msg_sz = -1; 183 attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); 184 attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); 185 attr->phys_state = sdev->state == IB_PORT_ACTIVE ? 186 IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED; 187 attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP; 188 attr->state = sdev->state; 189 /* 190 * All zero 191 * 192 * attr->lid = 0; 193 * attr->bad_pkey_cntr = 0; 194 * attr->qkey_viol_cntr = 0; 195 * attr->sm_lid = 0; 196 * attr->lmc = 0; 197 * attr->max_vl_num = 0; 198 * attr->sm_sl = 0; 199 * attr->subnet_timeout = 0; 200 * attr->init_type_repy = 0; 201 */ 202 return rv; 203 } 204 205 int siw_get_port_immutable(struct ib_device *base_dev, u32 port, 206 struct ib_port_immutable *port_immutable) 207 { 208 struct ib_port_attr attr; 209 int rv = siw_query_port(base_dev, port, &attr); 210 211 if (rv) 212 return rv; 213 214 port_immutable->gid_tbl_len = attr.gid_tbl_len; 215 port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP; 216 217 return 0; 218 } 219 220 int siw_query_gid(struct ib_device *base_dev, u32 port, int idx, 221 union ib_gid *gid) 222 { 223 struct siw_device *sdev = to_siw_dev(base_dev); 224 225 /* subnet_prefix == interface_id == 0; */ 226 memset(gid, 0, sizeof(*gid)); 227 memcpy(gid->raw, sdev->raw_gid, ETH_ALEN); 228 229 return 0; 230 } 231 232 int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata) 233 { 234 struct siw_device *sdev = to_siw_dev(pd->device); 235 236 if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) { 237 atomic_dec(&sdev->num_pd); 238 return -ENOMEM; 239 } 240 siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd)); 241 242 return 0; 243 } 244 245 int siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata) 246 { 247 struct siw_device *sdev = to_siw_dev(pd->device); 248 249 siw_dbg_pd(pd, "free PD\n"); 250 atomic_dec(&sdev->num_pd); 251 return 0; 252 } 253 254 void siw_qp_get_ref(struct ib_qp *base_qp) 255 { 256 siw_qp_get(to_siw_qp(base_qp)); 257 } 258 259 void siw_qp_put_ref(struct ib_qp *base_qp) 260 { 261 siw_qp_put(to_siw_qp(base_qp)); 262 } 263 264 static struct rdma_user_mmap_entry * 265 siw_mmap_entry_insert(struct siw_ucontext *uctx, 266 void *address, size_t length, 267 u64 *offset) 268 { 269 struct siw_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL); 270 int rv; 271 272 *offset = SIW_INVAL_UOBJ_KEY; 273 if (!entry) 274 return NULL; 275 276 entry->address = address; 277 278 rv = rdma_user_mmap_entry_insert(&uctx->base_ucontext, 279 &entry->rdma_entry, 280 length); 281 if (rv) { 282 kfree(entry); 283 return NULL; 284 } 285 286 *offset = rdma_user_mmap_get_offset(&entry->rdma_entry); 287 288 return &entry->rdma_entry; 289 } 290 291 /* 292 * siw_create_qp() 293 * 294 * Create QP of requested size on given device. 295 * 296 * @qp: Queue pait 297 * @attrs: Initial QP attributes. 298 * @udata: used to provide QP ID, SQ and RQ size back to user. 299 */ 300 301 int siw_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs, 302 struct ib_udata *udata) 303 { 304 struct ib_pd *pd = ibqp->pd; 305 struct siw_qp *qp = to_siw_qp(ibqp); 306 struct ib_device *base_dev = pd->device; 307 struct siw_device *sdev = to_siw_dev(base_dev); 308 struct siw_ucontext *uctx = 309 rdma_udata_to_drv_context(udata, struct siw_ucontext, 310 base_ucontext); 311 unsigned long flags; 312 int num_sqe, num_rqe, rv = 0; 313 size_t length; 314 315 siw_dbg(base_dev, "create new QP\n"); 316 317 if (attrs->create_flags) 318 return -EOPNOTSUPP; 319 320 if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) { 321 siw_dbg(base_dev, "too many QP's\n"); 322 rv = -ENOMEM; 323 goto err_atomic; 324 } 325 if (attrs->qp_type != IB_QPT_RC) { 326 siw_dbg(base_dev, "only RC QP's supported\n"); 327 rv = -EOPNOTSUPP; 328 goto err_atomic; 329 } 330 if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) || 331 (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) || 332 (attrs->cap.max_send_sge > SIW_MAX_SGE) || 333 (attrs->cap.max_recv_sge > SIW_MAX_SGE)) { 334 siw_dbg(base_dev, "QP size error\n"); 335 rv = -EINVAL; 336 goto err_atomic; 337 } 338 if (attrs->cap.max_inline_data > SIW_MAX_INLINE) { 339 siw_dbg(base_dev, "max inline send: %d > %d\n", 340 attrs->cap.max_inline_data, (int)SIW_MAX_INLINE); 341 rv = -EINVAL; 342 goto err_atomic; 343 } 344 /* 345 * NOTE: we don't allow for a QP unable to hold any SQ WQE 346 */ 347 if (attrs->cap.max_send_wr == 0) { 348 siw_dbg(base_dev, "QP must have send queue\n"); 349 rv = -EINVAL; 350 goto err_atomic; 351 } 352 353 if (!attrs->send_cq || (!attrs->recv_cq && !attrs->srq)) { 354 siw_dbg(base_dev, "send CQ or receive CQ invalid\n"); 355 rv = -EINVAL; 356 goto err_atomic; 357 } 358 359 init_rwsem(&qp->state_lock); 360 spin_lock_init(&qp->sq_lock); 361 spin_lock_init(&qp->rq_lock); 362 spin_lock_init(&qp->orq_lock); 363 364 rv = siw_qp_add(sdev, qp); 365 if (rv) 366 goto err_atomic; 367 368 369 /* All queue indices are derived from modulo operations 370 * on a free running 'get' (consumer) and 'put' (producer) 371 * unsigned counter. Having queue sizes at power of two 372 * avoids handling counter wrap around. 373 */ 374 num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr); 375 num_rqe = attrs->cap.max_recv_wr; 376 if (num_rqe) 377 num_rqe = roundup_pow_of_two(num_rqe); 378 379 if (udata) 380 qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe)); 381 else 382 qp->sendq = vcalloc(num_sqe, sizeof(struct siw_sqe)); 383 384 if (qp->sendq == NULL) { 385 rv = -ENOMEM; 386 goto err_out_xa; 387 } 388 if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) { 389 if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR) 390 qp->attrs.flags |= SIW_SIGNAL_ALL_WR; 391 else { 392 rv = -EINVAL; 393 goto err_out_xa; 394 } 395 } 396 qp->pd = pd; 397 qp->scq = to_siw_cq(attrs->send_cq); 398 qp->rcq = to_siw_cq(attrs->recv_cq); 399 400 if (attrs->srq) { 401 /* 402 * SRQ support. 403 * Verbs 6.3.7: ignore RQ size, if SRQ present 404 * Verbs 6.3.5: do not check PD of SRQ against PD of QP 405 */ 406 qp->srq = to_siw_srq(attrs->srq); 407 qp->attrs.rq_size = 0; 408 siw_dbg(base_dev, "QP [%u]: SRQ attached\n", 409 qp->base_qp.qp_num); 410 } else if (num_rqe) { 411 if (udata) 412 qp->recvq = 413 vmalloc_user(num_rqe * sizeof(struct siw_rqe)); 414 else 415 qp->recvq = vcalloc(num_rqe, sizeof(struct siw_rqe)); 416 417 if (qp->recvq == NULL) { 418 rv = -ENOMEM; 419 goto err_out_xa; 420 } 421 qp->attrs.rq_size = num_rqe; 422 } 423 qp->attrs.sq_size = num_sqe; 424 qp->attrs.sq_max_sges = attrs->cap.max_send_sge; 425 qp->attrs.rq_max_sges = attrs->cap.max_recv_sge; 426 427 /* Make those two tunables fixed for now. */ 428 qp->tx_ctx.gso_seg_limit = 1; 429 qp->tx_ctx.zcopy_tx = zcopy_tx; 430 431 qp->attrs.state = SIW_QP_STATE_IDLE; 432 433 if (udata) { 434 struct siw_uresp_create_qp uresp = {}; 435 436 uresp.num_sqe = num_sqe; 437 uresp.num_rqe = num_rqe; 438 uresp.qp_id = qp_id(qp); 439 440 if (qp->sendq) { 441 length = num_sqe * sizeof(struct siw_sqe); 442 qp->sq_entry = 443 siw_mmap_entry_insert(uctx, qp->sendq, 444 length, &uresp.sq_key); 445 if (!qp->sq_entry) { 446 rv = -ENOMEM; 447 goto err_out_xa; 448 } 449 } 450 451 if (qp->recvq) { 452 length = num_rqe * sizeof(struct siw_rqe); 453 qp->rq_entry = 454 siw_mmap_entry_insert(uctx, qp->recvq, 455 length, &uresp.rq_key); 456 if (!qp->rq_entry) { 457 uresp.sq_key = SIW_INVAL_UOBJ_KEY; 458 rv = -ENOMEM; 459 goto err_out_xa; 460 } 461 } 462 463 if (udata->outlen < sizeof(uresp)) { 464 rv = -EINVAL; 465 goto err_out_xa; 466 } 467 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 468 if (rv) 469 goto err_out_xa; 470 } 471 qp->tx_cpu = siw_get_tx_cpu(sdev); 472 if (qp->tx_cpu < 0) { 473 rv = -EINVAL; 474 goto err_out_xa; 475 } 476 INIT_LIST_HEAD(&qp->devq); 477 spin_lock_irqsave(&sdev->lock, flags); 478 list_add_tail(&qp->devq, &sdev->qp_list); 479 spin_unlock_irqrestore(&sdev->lock, flags); 480 481 init_completion(&qp->qp_free); 482 483 return 0; 484 485 err_out_xa: 486 xa_erase(&sdev->qp_xa, qp_id(qp)); 487 if (uctx) { 488 rdma_user_mmap_entry_remove(qp->sq_entry); 489 rdma_user_mmap_entry_remove(qp->rq_entry); 490 } 491 vfree(qp->sendq); 492 vfree(qp->recvq); 493 494 err_atomic: 495 atomic_dec(&sdev->num_qp); 496 return rv; 497 } 498 499 /* 500 * Minimum siw_query_qp() verb interface. 501 * 502 * @qp_attr_mask is not used but all available information is provided 503 */ 504 int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, 505 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) 506 { 507 struct siw_qp *qp; 508 struct siw_device *sdev; 509 510 if (base_qp && qp_attr && qp_init_attr) { 511 qp = to_siw_qp(base_qp); 512 sdev = to_siw_dev(base_qp->device); 513 } else { 514 return -EINVAL; 515 } 516 qp_attr->qp_state = siw_qp_state_to_ib_qp_state[qp->attrs.state]; 517 qp_attr->cap.max_inline_data = SIW_MAX_INLINE; 518 qp_attr->cap.max_send_wr = qp->attrs.sq_size; 519 qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; 520 qp_attr->cap.max_recv_wr = qp->attrs.rq_size; 521 qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; 522 qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); 523 qp_attr->max_rd_atomic = qp->attrs.irq_size; 524 qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; 525 526 qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | 527 IB_ACCESS_REMOTE_WRITE | 528 IB_ACCESS_REMOTE_READ; 529 530 qp_init_attr->qp_type = base_qp->qp_type; 531 qp_init_attr->send_cq = base_qp->send_cq; 532 qp_init_attr->recv_cq = base_qp->recv_cq; 533 qp_init_attr->srq = base_qp->srq; 534 535 qp_init_attr->cap = qp_attr->cap; 536 537 return 0; 538 } 539 540 int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, 541 int attr_mask, struct ib_udata *udata) 542 { 543 struct siw_qp_attrs new_attrs; 544 enum siw_qp_attr_mask siw_attr_mask = 0; 545 struct siw_qp *qp = to_siw_qp(base_qp); 546 int rv = 0; 547 548 if (!attr_mask) 549 return 0; 550 551 if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) 552 return -EOPNOTSUPP; 553 554 memset(&new_attrs, 0, sizeof(new_attrs)); 555 556 if (attr_mask & IB_QP_ACCESS_FLAGS) { 557 siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS; 558 559 if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) 560 new_attrs.flags |= SIW_RDMA_READ_ENABLED; 561 if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) 562 new_attrs.flags |= SIW_RDMA_WRITE_ENABLED; 563 if (attr->qp_access_flags & IB_ACCESS_MW_BIND) 564 new_attrs.flags |= SIW_RDMA_BIND_ENABLED; 565 } 566 if (attr_mask & IB_QP_STATE) { 567 siw_dbg_qp(qp, "desired IB QP state: %s\n", 568 ib_qp_state_to_string[attr->qp_state]); 569 570 new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state]; 571 572 if (new_attrs.state > SIW_QP_STATE_RTS) 573 qp->tx_ctx.tx_suspend = 1; 574 575 siw_attr_mask |= SIW_QP_ATTR_STATE; 576 } 577 if (!siw_attr_mask) 578 goto out; 579 580 down_write(&qp->state_lock); 581 582 rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask); 583 584 up_write(&qp->state_lock); 585 out: 586 return rv; 587 } 588 589 int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata) 590 { 591 struct siw_qp *qp = to_siw_qp(base_qp); 592 struct siw_ucontext *uctx = 593 rdma_udata_to_drv_context(udata, struct siw_ucontext, 594 base_ucontext); 595 struct siw_qp_attrs qp_attrs; 596 597 siw_dbg_qp(qp, "state %d\n", qp->attrs.state); 598 599 /* 600 * Mark QP as in process of destruction to prevent from 601 * any async callbacks to RDMA core 602 */ 603 qp->attrs.flags |= SIW_QP_IN_DESTROY; 604 qp->rx_stream.rx_suspend = 1; 605 606 if (uctx) { 607 rdma_user_mmap_entry_remove(qp->sq_entry); 608 rdma_user_mmap_entry_remove(qp->rq_entry); 609 } 610 611 down_write(&qp->state_lock); 612 613 qp_attrs.state = SIW_QP_STATE_ERROR; 614 siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE); 615 616 if (qp->cep) { 617 siw_cep_put(qp->cep); 618 qp->cep = NULL; 619 } 620 up_write(&qp->state_lock); 621 622 kfree(qp->tx_ctx.mpa_crc_hd); 623 kfree(qp->rx_stream.mpa_crc_hd); 624 625 qp->scq = qp->rcq = NULL; 626 627 siw_qp_put(qp); 628 wait_for_completion(&qp->qp_free); 629 630 return 0; 631 } 632 633 /* 634 * siw_copy_inline_sgl() 635 * 636 * Prepare sgl of inlined data for sending. For userland callers 637 * function checks if given buffer addresses and len's are within 638 * process context bounds. 639 * Data from all provided sge's are copied together into the wqe, 640 * referenced by a single sge. 641 */ 642 static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, 643 struct siw_sqe *sqe) 644 { 645 struct ib_sge *core_sge = core_wr->sg_list; 646 void *kbuf = &sqe->sge[1]; 647 int num_sge = core_wr->num_sge, bytes = 0; 648 649 sqe->sge[0].laddr = (uintptr_t)kbuf; 650 sqe->sge[0].lkey = 0; 651 652 while (num_sge--) { 653 if (!core_sge->length) { 654 core_sge++; 655 continue; 656 } 657 bytes += core_sge->length; 658 if (bytes > SIW_MAX_INLINE) { 659 bytes = -EINVAL; 660 break; 661 } 662 memcpy(kbuf, ib_virt_dma_to_ptr(core_sge->addr), 663 core_sge->length); 664 665 kbuf += core_sge->length; 666 core_sge++; 667 } 668 sqe->sge[0].length = max(bytes, 0); 669 sqe->num_sge = bytes > 0 ? 1 : 0; 670 671 return bytes; 672 } 673 674 /* Complete SQ WR's without processing */ 675 static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr, 676 const struct ib_send_wr **bad_wr) 677 { 678 int rv = 0; 679 680 while (wr) { 681 struct siw_sqe sqe = {}; 682 683 switch (wr->opcode) { 684 case IB_WR_RDMA_WRITE: 685 sqe.opcode = SIW_OP_WRITE; 686 break; 687 case IB_WR_RDMA_READ: 688 sqe.opcode = SIW_OP_READ; 689 break; 690 case IB_WR_RDMA_READ_WITH_INV: 691 sqe.opcode = SIW_OP_READ_LOCAL_INV; 692 break; 693 case IB_WR_SEND: 694 sqe.opcode = SIW_OP_SEND; 695 break; 696 case IB_WR_SEND_WITH_IMM: 697 sqe.opcode = SIW_OP_SEND_WITH_IMM; 698 break; 699 case IB_WR_SEND_WITH_INV: 700 sqe.opcode = SIW_OP_SEND_REMOTE_INV; 701 break; 702 case IB_WR_LOCAL_INV: 703 sqe.opcode = SIW_OP_INVAL_STAG; 704 break; 705 case IB_WR_REG_MR: 706 sqe.opcode = SIW_OP_REG_MR; 707 break; 708 default: 709 rv = -EINVAL; 710 break; 711 } 712 if (!rv) { 713 sqe.id = wr->wr_id; 714 rv = siw_sqe_complete(qp, &sqe, 0, 715 SIW_WC_WR_FLUSH_ERR); 716 } 717 if (rv) { 718 if (bad_wr) 719 *bad_wr = wr; 720 break; 721 } 722 wr = wr->next; 723 } 724 return rv; 725 } 726 727 /* Complete RQ WR's without processing */ 728 static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr, 729 const struct ib_recv_wr **bad_wr) 730 { 731 struct siw_rqe rqe = {}; 732 int rv = 0; 733 734 while (wr) { 735 rqe.id = wr->wr_id; 736 rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR); 737 if (rv) { 738 if (bad_wr) 739 *bad_wr = wr; 740 break; 741 } 742 wr = wr->next; 743 } 744 return rv; 745 } 746 747 /* 748 * siw_post_send() 749 * 750 * Post a list of S-WR's to a SQ. 751 * 752 * @base_qp: Base QP contained in siw QP 753 * @wr: Null terminated list of user WR's 754 * @bad_wr: Points to failing WR in case of synchronous failure. 755 */ 756 int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, 757 const struct ib_send_wr **bad_wr) 758 { 759 struct siw_qp *qp = to_siw_qp(base_qp); 760 struct siw_wqe *wqe = tx_wqe(qp); 761 762 unsigned long flags; 763 int rv = 0; 764 765 if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) { 766 siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); 767 *bad_wr = wr; 768 return -EINVAL; 769 } 770 771 /* 772 * Try to acquire QP state lock. Must be non-blocking 773 * to accommodate kernel clients needs. 774 */ 775 if (!down_read_trylock(&qp->state_lock)) { 776 if (qp->attrs.state == SIW_QP_STATE_ERROR) { 777 /* 778 * ERROR state is final, so we can be sure 779 * this state will not change as long as the QP 780 * exists. 781 * 782 * This handles an ib_drain_sq() call with 783 * a concurrent request to set the QP state 784 * to ERROR. 785 */ 786 rv = siw_sq_flush_wr(qp, wr, bad_wr); 787 } else { 788 siw_dbg_qp(qp, "QP locked, state %d\n", 789 qp->attrs.state); 790 *bad_wr = wr; 791 rv = -ENOTCONN; 792 } 793 return rv; 794 } 795 if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { 796 if (qp->attrs.state == SIW_QP_STATE_ERROR) { 797 /* 798 * Immediately flush this WR to CQ, if QP 799 * is in ERROR state. SQ is guaranteed to 800 * be empty, so WR complets in-order. 801 * 802 * Typically triggered by ib_drain_sq(). 803 */ 804 rv = siw_sq_flush_wr(qp, wr, bad_wr); 805 } else { 806 siw_dbg_qp(qp, "QP out of state %d\n", 807 qp->attrs.state); 808 *bad_wr = wr; 809 rv = -ENOTCONN; 810 } 811 up_read(&qp->state_lock); 812 return rv; 813 } 814 spin_lock_irqsave(&qp->sq_lock, flags); 815 816 while (wr) { 817 u32 idx = qp->sq_put % qp->attrs.sq_size; 818 struct siw_sqe *sqe = &qp->sendq[idx]; 819 820 if (sqe->flags) { 821 siw_dbg_qp(qp, "sq full\n"); 822 rv = -ENOMEM; 823 break; 824 } 825 if (wr->num_sge > qp->attrs.sq_max_sges) { 826 siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); 827 rv = -EINVAL; 828 break; 829 } 830 sqe->id = wr->wr_id; 831 832 if ((wr->send_flags & IB_SEND_SIGNALED) || 833 (qp->attrs.flags & SIW_SIGNAL_ALL_WR)) 834 sqe->flags |= SIW_WQE_SIGNALLED; 835 836 if (wr->send_flags & IB_SEND_FENCE) 837 sqe->flags |= SIW_WQE_READ_FENCE; 838 839 switch (wr->opcode) { 840 case IB_WR_SEND: 841 case IB_WR_SEND_WITH_INV: 842 if (wr->send_flags & IB_SEND_SOLICITED) 843 sqe->flags |= SIW_WQE_SOLICITED; 844 845 if (!(wr->send_flags & IB_SEND_INLINE)) { 846 siw_copy_sgl(wr->sg_list, sqe->sge, 847 wr->num_sge); 848 sqe->num_sge = wr->num_sge; 849 } else { 850 rv = siw_copy_inline_sgl(wr, sqe); 851 if (rv <= 0) { 852 rv = -EINVAL; 853 break; 854 } 855 sqe->flags |= SIW_WQE_INLINE; 856 sqe->num_sge = 1; 857 } 858 if (wr->opcode == IB_WR_SEND) 859 sqe->opcode = SIW_OP_SEND; 860 else { 861 sqe->opcode = SIW_OP_SEND_REMOTE_INV; 862 sqe->rkey = wr->ex.invalidate_rkey; 863 } 864 break; 865 866 case IB_WR_RDMA_READ_WITH_INV: 867 case IB_WR_RDMA_READ: 868 /* 869 * iWarp restricts RREAD sink to SGL containing 870 * 1 SGE only. we could relax to SGL with multiple 871 * elements referring the SAME ltag or even sending 872 * a private per-rreq tag referring to a checked 873 * local sgl with MULTIPLE ltag's. 874 */ 875 if (unlikely(wr->num_sge != 1)) { 876 rv = -EINVAL; 877 break; 878 } 879 siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1); 880 /* 881 * NOTE: zero length RREAD is allowed! 882 */ 883 sqe->raddr = rdma_wr(wr)->remote_addr; 884 sqe->rkey = rdma_wr(wr)->rkey; 885 sqe->num_sge = 1; 886 887 if (wr->opcode == IB_WR_RDMA_READ) 888 sqe->opcode = SIW_OP_READ; 889 else 890 sqe->opcode = SIW_OP_READ_LOCAL_INV; 891 break; 892 893 case IB_WR_RDMA_WRITE: 894 if (!(wr->send_flags & IB_SEND_INLINE)) { 895 siw_copy_sgl(wr->sg_list, &sqe->sge[0], 896 wr->num_sge); 897 sqe->num_sge = wr->num_sge; 898 } else { 899 rv = siw_copy_inline_sgl(wr, sqe); 900 if (unlikely(rv < 0)) { 901 rv = -EINVAL; 902 break; 903 } 904 sqe->flags |= SIW_WQE_INLINE; 905 sqe->num_sge = 1; 906 } 907 sqe->raddr = rdma_wr(wr)->remote_addr; 908 sqe->rkey = rdma_wr(wr)->rkey; 909 sqe->opcode = SIW_OP_WRITE; 910 break; 911 912 case IB_WR_REG_MR: 913 sqe->base_mr = (uintptr_t)reg_wr(wr)->mr; 914 sqe->rkey = reg_wr(wr)->key; 915 sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK; 916 sqe->opcode = SIW_OP_REG_MR; 917 break; 918 919 case IB_WR_LOCAL_INV: 920 sqe->rkey = wr->ex.invalidate_rkey; 921 sqe->opcode = SIW_OP_INVAL_STAG; 922 break; 923 924 default: 925 siw_dbg_qp(qp, "ib wr type %d unsupported\n", 926 wr->opcode); 927 rv = -EINVAL; 928 break; 929 } 930 siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n", 931 sqe->opcode, sqe->flags, 932 (void *)(uintptr_t)sqe->id); 933 934 if (unlikely(rv < 0)) 935 break; 936 937 /* make SQE only valid after completely written */ 938 smp_wmb(); 939 sqe->flags |= SIW_WQE_VALID; 940 941 qp->sq_put++; 942 wr = wr->next; 943 } 944 945 /* 946 * Send directly if SQ processing is not in progress. 947 * Eventual immediate errors (rv < 0) do not affect the involved 948 * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ 949 * processing, if new work is already pending. But rv must be passed 950 * to caller. 951 */ 952 if (wqe->wr_status != SIW_WR_IDLE) { 953 spin_unlock_irqrestore(&qp->sq_lock, flags); 954 goto skip_direct_sending; 955 } 956 rv = siw_activate_tx(qp); 957 spin_unlock_irqrestore(&qp->sq_lock, flags); 958 959 if (rv <= 0) 960 goto skip_direct_sending; 961 962 if (rdma_is_kernel_res(&qp->base_qp.res)) { 963 rv = siw_sq_start(qp); 964 } else { 965 qp->tx_ctx.in_syscall = 1; 966 967 if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend)) 968 siw_qp_cm_drop(qp, 0); 969 970 qp->tx_ctx.in_syscall = 0; 971 } 972 skip_direct_sending: 973 974 up_read(&qp->state_lock); 975 976 if (rv >= 0) 977 return 0; 978 /* 979 * Immediate error 980 */ 981 siw_dbg_qp(qp, "error %d\n", rv); 982 983 *bad_wr = wr; 984 return rv; 985 } 986 987 /* 988 * siw_post_receive() 989 * 990 * Post a list of R-WR's to a RQ. 991 * 992 * @base_qp: Base QP contained in siw QP 993 * @wr: Null terminated list of user WR's 994 * @bad_wr: Points to failing WR in case of synchronous failure. 995 */ 996 int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, 997 const struct ib_recv_wr **bad_wr) 998 { 999 struct siw_qp *qp = to_siw_qp(base_qp); 1000 unsigned long flags; 1001 int rv = 0; 1002 1003 if (qp->srq || qp->attrs.rq_size == 0) { 1004 *bad_wr = wr; 1005 return -EINVAL; 1006 } 1007 if (!rdma_is_kernel_res(&qp->base_qp.res)) { 1008 siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n"); 1009 *bad_wr = wr; 1010 return -EINVAL; 1011 } 1012 1013 /* 1014 * Try to acquire QP state lock. Must be non-blocking 1015 * to accommodate kernel clients needs. 1016 */ 1017 if (!down_read_trylock(&qp->state_lock)) { 1018 if (qp->attrs.state == SIW_QP_STATE_ERROR) { 1019 /* 1020 * ERROR state is final, so we can be sure 1021 * this state will not change as long as the QP 1022 * exists. 1023 * 1024 * This handles an ib_drain_rq() call with 1025 * a concurrent request to set the QP state 1026 * to ERROR. 1027 */ 1028 rv = siw_rq_flush_wr(qp, wr, bad_wr); 1029 } else { 1030 siw_dbg_qp(qp, "QP locked, state %d\n", 1031 qp->attrs.state); 1032 *bad_wr = wr; 1033 rv = -ENOTCONN; 1034 } 1035 return rv; 1036 } 1037 if (qp->attrs.state > SIW_QP_STATE_RTS) { 1038 if (qp->attrs.state == SIW_QP_STATE_ERROR) { 1039 /* 1040 * Immediately flush this WR to CQ, if QP 1041 * is in ERROR state. RQ is guaranteed to 1042 * be empty, so WR complets in-order. 1043 * 1044 * Typically triggered by ib_drain_rq(). 1045 */ 1046 rv = siw_rq_flush_wr(qp, wr, bad_wr); 1047 } else { 1048 siw_dbg_qp(qp, "QP out of state %d\n", 1049 qp->attrs.state); 1050 *bad_wr = wr; 1051 rv = -ENOTCONN; 1052 } 1053 up_read(&qp->state_lock); 1054 return rv; 1055 } 1056 /* 1057 * Serialize potentially multiple producers. 1058 * Not needed for single threaded consumer side. 1059 */ 1060 spin_lock_irqsave(&qp->rq_lock, flags); 1061 1062 while (wr) { 1063 u32 idx = qp->rq_put % qp->attrs.rq_size; 1064 struct siw_rqe *rqe = &qp->recvq[idx]; 1065 1066 if (rqe->flags) { 1067 siw_dbg_qp(qp, "RQ full\n"); 1068 rv = -ENOMEM; 1069 break; 1070 } 1071 if (wr->num_sge > qp->attrs.rq_max_sges) { 1072 siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); 1073 rv = -EINVAL; 1074 break; 1075 } 1076 rqe->id = wr->wr_id; 1077 rqe->num_sge = wr->num_sge; 1078 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); 1079 1080 /* make sure RQE is completely written before valid */ 1081 smp_wmb(); 1082 1083 rqe->flags = SIW_WQE_VALID; 1084 1085 qp->rq_put++; 1086 wr = wr->next; 1087 } 1088 spin_unlock_irqrestore(&qp->rq_lock, flags); 1089 1090 up_read(&qp->state_lock); 1091 1092 if (rv < 0) { 1093 siw_dbg_qp(qp, "error %d\n", rv); 1094 *bad_wr = wr; 1095 } 1096 return rv > 0 ? 0 : rv; 1097 } 1098 1099 int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) 1100 { 1101 struct siw_cq *cq = to_siw_cq(base_cq); 1102 struct siw_device *sdev = to_siw_dev(base_cq->device); 1103 struct siw_ucontext *ctx = 1104 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1105 base_ucontext); 1106 1107 siw_dbg_cq(cq, "free CQ resources\n"); 1108 1109 siw_cq_flush(cq); 1110 1111 if (ctx) 1112 rdma_user_mmap_entry_remove(cq->cq_entry); 1113 1114 atomic_dec(&sdev->num_cq); 1115 1116 vfree(cq->queue); 1117 return 0; 1118 } 1119 1120 /* 1121 * siw_create_cq() 1122 * 1123 * Populate CQ of requested size 1124 * 1125 * @base_cq: CQ as allocated by RDMA midlayer 1126 * @attr: Initial CQ attributes 1127 * @udata: relates to user context 1128 */ 1129 1130 int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, 1131 struct ib_udata *udata) 1132 { 1133 struct siw_device *sdev = to_siw_dev(base_cq->device); 1134 struct siw_cq *cq = to_siw_cq(base_cq); 1135 int rv, size = attr->cqe; 1136 1137 if (attr->flags) 1138 return -EOPNOTSUPP; 1139 1140 if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) { 1141 siw_dbg(base_cq->device, "too many CQ's\n"); 1142 rv = -ENOMEM; 1143 goto err_out; 1144 } 1145 if (size < 1 || size > sdev->attrs.max_cqe) { 1146 siw_dbg(base_cq->device, "CQ size error: %d\n", size); 1147 rv = -EINVAL; 1148 goto err_out; 1149 } 1150 size = roundup_pow_of_two(size); 1151 cq->base_cq.cqe = size; 1152 cq->num_cqe = size; 1153 1154 if (udata) 1155 cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) + 1156 sizeof(struct siw_cq_ctrl)); 1157 else 1158 cq->queue = vzalloc(size * sizeof(struct siw_cqe) + 1159 sizeof(struct siw_cq_ctrl)); 1160 1161 if (cq->queue == NULL) { 1162 rv = -ENOMEM; 1163 goto err_out; 1164 } 1165 get_random_bytes(&cq->id, 4); 1166 siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id); 1167 1168 spin_lock_init(&cq->lock); 1169 1170 cq->notify = (struct siw_cq_ctrl *)&cq->queue[size]; 1171 1172 if (udata) { 1173 struct siw_uresp_create_cq uresp = {}; 1174 struct siw_ucontext *ctx = 1175 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1176 base_ucontext); 1177 size_t length = size * sizeof(struct siw_cqe) + 1178 sizeof(struct siw_cq_ctrl); 1179 1180 cq->cq_entry = 1181 siw_mmap_entry_insert(ctx, cq->queue, 1182 length, &uresp.cq_key); 1183 if (!cq->cq_entry) { 1184 rv = -ENOMEM; 1185 goto err_out; 1186 } 1187 1188 uresp.cq_id = cq->id; 1189 uresp.num_cqe = size; 1190 1191 if (udata->outlen < sizeof(uresp)) { 1192 rv = -EINVAL; 1193 goto err_out; 1194 } 1195 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 1196 if (rv) 1197 goto err_out; 1198 } 1199 return 0; 1200 1201 err_out: 1202 siw_dbg(base_cq->device, "CQ creation failed: %d", rv); 1203 1204 if (cq->queue) { 1205 struct siw_ucontext *ctx = 1206 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1207 base_ucontext); 1208 if (ctx) 1209 rdma_user_mmap_entry_remove(cq->cq_entry); 1210 vfree(cq->queue); 1211 } 1212 atomic_dec(&sdev->num_cq); 1213 1214 return rv; 1215 } 1216 1217 /* 1218 * siw_poll_cq() 1219 * 1220 * Reap CQ entries if available and copy work completion status into 1221 * array of WC's provided by caller. Returns number of reaped CQE's. 1222 * 1223 * @base_cq: Base CQ contained in siw CQ. 1224 * @num_cqe: Maximum number of CQE's to reap. 1225 * @wc: Array of work completions to be filled by siw. 1226 */ 1227 int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc) 1228 { 1229 struct siw_cq *cq = to_siw_cq(base_cq); 1230 int i; 1231 1232 for (i = 0; i < num_cqe; i++) { 1233 if (!siw_reap_cqe(cq, wc)) 1234 break; 1235 wc++; 1236 } 1237 return i; 1238 } 1239 1240 /* 1241 * siw_req_notify_cq() 1242 * 1243 * Request notification for new CQE's added to that CQ. 1244 * Defined flags: 1245 * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification 1246 * event if a WQE with notification flag set enters the CQ 1247 * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification 1248 * event if a WQE enters the CQ. 1249 * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the 1250 * number of not reaped CQE's regardless of its notification 1251 * type and current or new CQ notification settings. 1252 * 1253 * @base_cq: Base CQ contained in siw CQ. 1254 * @flags: Requested notification flags. 1255 */ 1256 int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) 1257 { 1258 struct siw_cq *cq = to_siw_cq(base_cq); 1259 1260 siw_dbg_cq(cq, "flags: 0x%02x\n", flags); 1261 1262 if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) 1263 /* 1264 * Enable CQ event for next solicited completion. 1265 * and make it visible to all associated producers. 1266 */ 1267 smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED); 1268 else 1269 /* 1270 * Enable CQ event for any signalled completion. 1271 * and make it visible to all associated producers. 1272 */ 1273 smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL); 1274 1275 if (flags & IB_CQ_REPORT_MISSED_EVENTS) 1276 return cq->cq_put - cq->cq_get; 1277 1278 return 0; 1279 } 1280 1281 /* 1282 * siw_dereg_mr() 1283 * 1284 * Release Memory Region. 1285 * 1286 * @base_mr: Base MR contained in siw MR. 1287 * @udata: points to user context, unused. 1288 */ 1289 int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata) 1290 { 1291 struct siw_mr *mr = to_siw_mr(base_mr); 1292 struct siw_device *sdev = to_siw_dev(base_mr->device); 1293 1294 siw_dbg_mem(mr->mem, "deregister MR\n"); 1295 1296 atomic_dec(&sdev->num_mr); 1297 1298 siw_mr_drop_mem(mr); 1299 kfree_rcu(mr, rcu); 1300 1301 return 0; 1302 } 1303 1304 /* 1305 * siw_reg_user_mr() 1306 * 1307 * Register Memory Region. 1308 * 1309 * @pd: Protection Domain 1310 * @start: starting address of MR (virtual address) 1311 * @len: len of MR 1312 * @rnic_va: not used by siw 1313 * @rights: MR access rights 1314 * @udata: user buffer to communicate STag and Key. 1315 */ 1316 struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, 1317 u64 rnic_va, int rights, struct ib_udata *udata) 1318 { 1319 struct siw_mr *mr = NULL; 1320 struct siw_umem *umem = NULL; 1321 struct siw_ureq_reg_mr ureq; 1322 struct siw_device *sdev = to_siw_dev(pd->device); 1323 int rv; 1324 1325 siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n", 1326 (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va, 1327 (unsigned long long)len); 1328 1329 if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { 1330 siw_dbg_pd(pd, "too many mr's\n"); 1331 rv = -ENOMEM; 1332 goto err_out; 1333 } 1334 if (!len) { 1335 rv = -EINVAL; 1336 goto err_out; 1337 } 1338 umem = siw_umem_get(pd->device, start, len, rights); 1339 if (IS_ERR(umem)) { 1340 rv = PTR_ERR(umem); 1341 siw_dbg_pd(pd, "getting user memory failed: %d\n", rv); 1342 umem = NULL; 1343 goto err_out; 1344 } 1345 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1346 if (!mr) { 1347 rv = -ENOMEM; 1348 goto err_out; 1349 } 1350 rv = siw_mr_add_mem(mr, pd, umem, start, len, rights); 1351 if (rv) 1352 goto err_out; 1353 1354 if (udata) { 1355 struct siw_uresp_reg_mr uresp = {}; 1356 struct siw_mem *mem = mr->mem; 1357 1358 if (udata->inlen < sizeof(ureq)) { 1359 rv = -EINVAL; 1360 goto err_out; 1361 } 1362 rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); 1363 if (rv) 1364 goto err_out; 1365 1366 mr->base_mr.lkey |= ureq.stag_key; 1367 mr->base_mr.rkey |= ureq.stag_key; 1368 mem->stag |= ureq.stag_key; 1369 uresp.stag = mem->stag; 1370 1371 if (udata->outlen < sizeof(uresp)) { 1372 rv = -EINVAL; 1373 goto err_out; 1374 } 1375 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 1376 if (rv) 1377 goto err_out; 1378 } 1379 mr->mem->stag_valid = 1; 1380 1381 return &mr->base_mr; 1382 1383 err_out: 1384 atomic_dec(&sdev->num_mr); 1385 if (mr) { 1386 if (mr->mem) 1387 siw_mr_drop_mem(mr); 1388 kfree_rcu(mr, rcu); 1389 } else { 1390 if (umem) 1391 siw_umem_release(umem); 1392 } 1393 return ERR_PTR(rv); 1394 } 1395 1396 struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, 1397 u32 max_sge) 1398 { 1399 struct siw_device *sdev = to_siw_dev(pd->device); 1400 struct siw_mr *mr = NULL; 1401 struct siw_pbl *pbl = NULL; 1402 int rv; 1403 1404 if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { 1405 siw_dbg_pd(pd, "too many mr's\n"); 1406 rv = -ENOMEM; 1407 goto err_out; 1408 } 1409 if (mr_type != IB_MR_TYPE_MEM_REG) { 1410 siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type); 1411 rv = -EOPNOTSUPP; 1412 goto err_out; 1413 } 1414 if (max_sge > SIW_MAX_SGE_PBL) { 1415 siw_dbg_pd(pd, "too many sge's: %d\n", max_sge); 1416 rv = -ENOMEM; 1417 goto err_out; 1418 } 1419 pbl = siw_pbl_alloc(max_sge); 1420 if (IS_ERR(pbl)) { 1421 rv = PTR_ERR(pbl); 1422 siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv); 1423 pbl = NULL; 1424 goto err_out; 1425 } 1426 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1427 if (!mr) { 1428 rv = -ENOMEM; 1429 goto err_out; 1430 } 1431 rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0); 1432 if (rv) 1433 goto err_out; 1434 1435 mr->mem->is_pbl = 1; 1436 1437 siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); 1438 1439 return &mr->base_mr; 1440 1441 err_out: 1442 atomic_dec(&sdev->num_mr); 1443 1444 if (!mr) { 1445 kfree(pbl); 1446 } else { 1447 if (mr->mem) 1448 siw_mr_drop_mem(mr); 1449 kfree_rcu(mr, rcu); 1450 } 1451 siw_dbg_pd(pd, "failed: %d\n", rv); 1452 1453 return ERR_PTR(rv); 1454 } 1455 1456 /* Just used to count number of pages being mapped */ 1457 static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr) 1458 { 1459 return 0; 1460 } 1461 1462 int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, 1463 unsigned int *sg_off) 1464 { 1465 struct scatterlist *slp; 1466 struct siw_mr *mr = to_siw_mr(base_mr); 1467 struct siw_mem *mem = mr->mem; 1468 struct siw_pbl *pbl = mem->pbl; 1469 struct siw_pble *pble; 1470 unsigned long pbl_size; 1471 int i, rv; 1472 1473 if (!pbl) { 1474 siw_dbg_mem(mem, "no PBL allocated\n"); 1475 return -EINVAL; 1476 } 1477 pble = pbl->pbe; 1478 1479 if (pbl->max_buf < num_sle) { 1480 siw_dbg_mem(mem, "too many SGE's: %d > %d\n", 1481 num_sle, pbl->max_buf); 1482 return -ENOMEM; 1483 } 1484 for_each_sg(sl, slp, num_sle, i) { 1485 if (sg_dma_len(slp) == 0) { 1486 siw_dbg_mem(mem, "empty SGE\n"); 1487 return -EINVAL; 1488 } 1489 if (i == 0) { 1490 pble->addr = sg_dma_address(slp); 1491 pble->size = sg_dma_len(slp); 1492 pble->pbl_off = 0; 1493 pbl_size = pble->size; 1494 pbl->num_buf = 1; 1495 } else { 1496 /* Merge PBL entries if adjacent */ 1497 if (pble->addr + pble->size == sg_dma_address(slp)) { 1498 pble->size += sg_dma_len(slp); 1499 } else { 1500 pble++; 1501 pbl->num_buf++; 1502 pble->addr = sg_dma_address(slp); 1503 pble->size = sg_dma_len(slp); 1504 pble->pbl_off = pbl_size; 1505 } 1506 pbl_size += sg_dma_len(slp); 1507 } 1508 siw_dbg_mem(mem, 1509 "sge[%d], size %u, addr 0x%p, total %lu\n", 1510 i, pble->size, ib_virt_dma_to_ptr(pble->addr), 1511 pbl_size); 1512 } 1513 rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); 1514 if (rv > 0) { 1515 mem->len = base_mr->length; 1516 mem->va = base_mr->iova; 1517 siw_dbg_mem(mem, 1518 "%llu bytes, start 0x%pK, %u SLE to %u entries\n", 1519 mem->len, (void *)(uintptr_t)mem->va, num_sle, 1520 pbl->num_buf); 1521 } 1522 return rv; 1523 } 1524 1525 /* 1526 * siw_get_dma_mr() 1527 * 1528 * Create a (empty) DMA memory region, where no umem is attached. 1529 */ 1530 struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights) 1531 { 1532 struct siw_device *sdev = to_siw_dev(pd->device); 1533 struct siw_mr *mr = NULL; 1534 int rv; 1535 1536 if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { 1537 siw_dbg_pd(pd, "too many mr's\n"); 1538 rv = -ENOMEM; 1539 goto err_out; 1540 } 1541 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1542 if (!mr) { 1543 rv = -ENOMEM; 1544 goto err_out; 1545 } 1546 rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights); 1547 if (rv) 1548 goto err_out; 1549 1550 mr->mem->stag_valid = 1; 1551 1552 siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); 1553 1554 return &mr->base_mr; 1555 1556 err_out: 1557 if (rv) 1558 kfree(mr); 1559 1560 atomic_dec(&sdev->num_mr); 1561 1562 return ERR_PTR(rv); 1563 } 1564 1565 /* 1566 * siw_create_srq() 1567 * 1568 * Create Shared Receive Queue of attributes @init_attrs 1569 * within protection domain given by @pd. 1570 * 1571 * @base_srq: Base SRQ contained in siw SRQ. 1572 * @init_attrs: SRQ init attributes. 1573 * @udata: points to user context 1574 */ 1575 int siw_create_srq(struct ib_srq *base_srq, 1576 struct ib_srq_init_attr *init_attrs, struct ib_udata *udata) 1577 { 1578 struct siw_srq *srq = to_siw_srq(base_srq); 1579 struct ib_srq_attr *attrs = &init_attrs->attr; 1580 struct siw_device *sdev = to_siw_dev(base_srq->device); 1581 struct siw_ucontext *ctx = 1582 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1583 base_ucontext); 1584 int rv; 1585 1586 if (init_attrs->srq_type != IB_SRQT_BASIC) 1587 return -EOPNOTSUPP; 1588 1589 if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) { 1590 siw_dbg_pd(base_srq->pd, "too many SRQ's\n"); 1591 rv = -ENOMEM; 1592 goto err_out; 1593 } 1594 if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR || 1595 attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) { 1596 rv = -EINVAL; 1597 goto err_out; 1598 } 1599 srq->max_sge = attrs->max_sge; 1600 srq->num_rqe = roundup_pow_of_two(attrs->max_wr); 1601 srq->limit = attrs->srq_limit; 1602 if (srq->limit) 1603 srq->armed = true; 1604 1605 srq->is_kernel_res = !udata; 1606 1607 if (udata) 1608 srq->recvq = 1609 vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe)); 1610 else 1611 srq->recvq = vcalloc(srq->num_rqe, sizeof(struct siw_rqe)); 1612 1613 if (srq->recvq == NULL) { 1614 rv = -ENOMEM; 1615 goto err_out; 1616 } 1617 if (udata) { 1618 struct siw_uresp_create_srq uresp = {}; 1619 size_t length = srq->num_rqe * sizeof(struct siw_rqe); 1620 1621 srq->srq_entry = 1622 siw_mmap_entry_insert(ctx, srq->recvq, 1623 length, &uresp.srq_key); 1624 if (!srq->srq_entry) { 1625 rv = -ENOMEM; 1626 goto err_out; 1627 } 1628 1629 uresp.num_rqe = srq->num_rqe; 1630 1631 if (udata->outlen < sizeof(uresp)) { 1632 rv = -EINVAL; 1633 goto err_out; 1634 } 1635 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 1636 if (rv) 1637 goto err_out; 1638 } 1639 spin_lock_init(&srq->lock); 1640 1641 siw_dbg_pd(base_srq->pd, "[SRQ]: success\n"); 1642 1643 return 0; 1644 1645 err_out: 1646 if (srq->recvq) { 1647 if (ctx) 1648 rdma_user_mmap_entry_remove(srq->srq_entry); 1649 vfree(srq->recvq); 1650 } 1651 atomic_dec(&sdev->num_srq); 1652 1653 return rv; 1654 } 1655 1656 /* 1657 * siw_modify_srq() 1658 * 1659 * Modify SRQ. The caller may resize SRQ and/or set/reset notification 1660 * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification. 1661 * 1662 * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE 1663 * parameter. siw_modify_srq() does not check the attrs->max_sge param. 1664 */ 1665 int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs, 1666 enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) 1667 { 1668 struct siw_srq *srq = to_siw_srq(base_srq); 1669 unsigned long flags; 1670 int rv = 0; 1671 1672 spin_lock_irqsave(&srq->lock, flags); 1673 1674 if (attr_mask & IB_SRQ_MAX_WR) { 1675 /* resize request not yet supported */ 1676 rv = -EOPNOTSUPP; 1677 goto out; 1678 } 1679 if (attr_mask & IB_SRQ_LIMIT) { 1680 if (attrs->srq_limit) { 1681 if (unlikely(attrs->srq_limit > srq->num_rqe)) { 1682 rv = -EINVAL; 1683 goto out; 1684 } 1685 srq->armed = true; 1686 } else { 1687 srq->armed = false; 1688 } 1689 srq->limit = attrs->srq_limit; 1690 } 1691 out: 1692 spin_unlock_irqrestore(&srq->lock, flags); 1693 1694 return rv; 1695 } 1696 1697 /* 1698 * siw_query_srq() 1699 * 1700 * Query SRQ attributes. 1701 */ 1702 int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs) 1703 { 1704 struct siw_srq *srq = to_siw_srq(base_srq); 1705 unsigned long flags; 1706 1707 spin_lock_irqsave(&srq->lock, flags); 1708 1709 attrs->max_wr = srq->num_rqe; 1710 attrs->max_sge = srq->max_sge; 1711 attrs->srq_limit = srq->limit; 1712 1713 spin_unlock_irqrestore(&srq->lock, flags); 1714 1715 return 0; 1716 } 1717 1718 /* 1719 * siw_destroy_srq() 1720 * 1721 * Destroy SRQ. 1722 * It is assumed that the SRQ is not referenced by any 1723 * QP anymore - the code trusts the RDMA core environment to keep track 1724 * of QP references. 1725 */ 1726 int siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata) 1727 { 1728 struct siw_srq *srq = to_siw_srq(base_srq); 1729 struct siw_device *sdev = to_siw_dev(base_srq->device); 1730 struct siw_ucontext *ctx = 1731 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1732 base_ucontext); 1733 1734 if (ctx) 1735 rdma_user_mmap_entry_remove(srq->srq_entry); 1736 vfree(srq->recvq); 1737 atomic_dec(&sdev->num_srq); 1738 return 0; 1739 } 1740 1741 /* 1742 * siw_post_srq_recv() 1743 * 1744 * Post a list of receive queue elements to SRQ. 1745 * NOTE: The function does not check or lock a certain SRQ state 1746 * during the post operation. The code simply trusts the 1747 * RDMA core environment. 1748 * 1749 * @base_srq: Base SRQ contained in siw SRQ 1750 * @wr: List of R-WR's 1751 * @bad_wr: Updated to failing WR if posting fails. 1752 */ 1753 int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, 1754 const struct ib_recv_wr **bad_wr) 1755 { 1756 struct siw_srq *srq = to_siw_srq(base_srq); 1757 unsigned long flags; 1758 int rv = 0; 1759 1760 if (unlikely(!srq->is_kernel_res)) { 1761 siw_dbg_pd(base_srq->pd, 1762 "[SRQ]: no kernel post_recv for mapped srq\n"); 1763 rv = -EINVAL; 1764 goto out; 1765 } 1766 /* 1767 * Serialize potentially multiple producers. 1768 * Also needed to serialize potentially multiple 1769 * consumers. 1770 */ 1771 spin_lock_irqsave(&srq->lock, flags); 1772 1773 while (wr) { 1774 u32 idx = srq->rq_put % srq->num_rqe; 1775 struct siw_rqe *rqe = &srq->recvq[idx]; 1776 1777 if (rqe->flags) { 1778 siw_dbg_pd(base_srq->pd, "SRQ full\n"); 1779 rv = -ENOMEM; 1780 break; 1781 } 1782 if (unlikely(wr->num_sge > srq->max_sge)) { 1783 siw_dbg_pd(base_srq->pd, 1784 "[SRQ]: too many sge's: %d\n", wr->num_sge); 1785 rv = -EINVAL; 1786 break; 1787 } 1788 rqe->id = wr->wr_id; 1789 rqe->num_sge = wr->num_sge; 1790 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); 1791 1792 /* Make sure S-RQE is completely written before valid */ 1793 smp_wmb(); 1794 1795 rqe->flags = SIW_WQE_VALID; 1796 1797 srq->rq_put++; 1798 wr = wr->next; 1799 } 1800 spin_unlock_irqrestore(&srq->lock, flags); 1801 out: 1802 if (unlikely(rv < 0)) { 1803 siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv); 1804 *bad_wr = wr; 1805 } 1806 return rv; 1807 } 1808 1809 void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype) 1810 { 1811 struct ib_event event; 1812 struct ib_qp *base_qp = &qp->base_qp; 1813 1814 /* 1815 * Do not report asynchronous errors on QP which gets 1816 * destroyed via verbs interface (siw_destroy_qp()) 1817 */ 1818 if (qp->attrs.flags & SIW_QP_IN_DESTROY) 1819 return; 1820 1821 event.event = etype; 1822 event.device = base_qp->device; 1823 event.element.qp = base_qp; 1824 1825 if (base_qp->event_handler) { 1826 siw_dbg_qp(qp, "reporting event %d\n", etype); 1827 base_qp->event_handler(&event, base_qp->qp_context); 1828 } 1829 } 1830 1831 void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype) 1832 { 1833 struct ib_event event; 1834 struct ib_cq *base_cq = &cq->base_cq; 1835 1836 event.event = etype; 1837 event.device = base_cq->device; 1838 event.element.cq = base_cq; 1839 1840 if (base_cq->event_handler) { 1841 siw_dbg_cq(cq, "reporting CQ event %d\n", etype); 1842 base_cq->event_handler(&event, base_cq->cq_context); 1843 } 1844 } 1845 1846 void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype) 1847 { 1848 struct ib_event event; 1849 struct ib_srq *base_srq = &srq->base_srq; 1850 1851 event.event = etype; 1852 event.device = base_srq->device; 1853 event.element.srq = base_srq; 1854 1855 if (base_srq->event_handler) { 1856 siw_dbg_pd(srq->base_srq.pd, 1857 "reporting SRQ event %d\n", etype); 1858 base_srq->event_handler(&event, base_srq->srq_context); 1859 } 1860 } 1861 1862 void siw_port_event(struct siw_device *sdev, u32 port, enum ib_event_type etype) 1863 { 1864 struct ib_event event; 1865 1866 event.event = etype; 1867 event.device = &sdev->base_dev; 1868 event.element.port_num = port; 1869 1870 siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype); 1871 1872 ib_dispatch_event(&event); 1873 } 1874