// SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/xarray.h>
#include <net/addrconf.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/uverbs_ioctl.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

/* siw QP state -> IB QP state, as reported by siw_query_qp() */
static int siw_qp_state_to_ib_qp_state[SIW_QP_STATE_COUNT] = {
	[SIW_QP_STATE_IDLE] = IB_QPS_INIT,
	[SIW_QP_STATE_RTR] = IB_QPS_RTR,
	[SIW_QP_STATE_RTS] = IB_QPS_RTS,
	[SIW_QP_STATE_CLOSING] = IB_QPS_SQD,
	[SIW_QP_STATE_TERMINATE] = IB_QPS_SQE,
	[SIW_QP_STATE_ERROR] = IB_QPS_ERR
};

/*
 * IB QP state -> siw QP state, used by siw_verbs_modify_qp().
 * Both IB_QPS_RESET and IB_QPS_INIT map to SIW_QP_STATE_IDLE.
 */
static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
	[IB_QPS_RESET] = SIW_QP_STATE_IDLE,
	[IB_QPS_INIT] = SIW_QP_STATE_IDLE,
	[IB_QPS_RTR] = SIW_QP_STATE_RTR,
	[IB_QPS_RTS] = SIW_QP_STATE_RTS,
	[IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
	[IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
	[IB_QPS_ERR] = SIW_QP_STATE_ERROR
};

/*
 * Printable IB QP state names for debug output. Each row is sized
 * for the longest name ("RESET") including its terminating NUL.
 */
static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
	[IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
	[IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE",
	[IB_QPS_ERR] = "ERR"
};

/*
 * siw_mmap_free()
 *
 * Free the siw mmap entry which embeds @rdma_entry. The mapped
 * queue memory itself is not touched here.
 */
void siw_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
{
	struct siw_user_mmap_entry *entry = to_siw_mmap_entry(rdma_entry);

	kfree(entry);
}

/*
 * siw_mmap()
 *
 * Map the memory referenced by a previously inserted mmap entry
 * into the given VMA.
 *
 * @ctx:	user context the entry was registered with
 * @vma:	target VMA; used to look up the entry
 *
 * Returns -EINVAL if the VMA start is not page aligned or no entry
 * is found, otherwise the result of remap_vmalloc_range().
 */
int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
{
	struct siw_ucontext *uctx = to_siw_ctx(ctx);
	size_t size = vma->vm_end - vma->vm_start;
	struct rdma_user_mmap_entry *rdma_entry;
	struct siw_user_mmap_entry *entry;
	int rv = -EINVAL;

	/*
	 * Must be page aligned
	 */
	if (vma->vm_start & (PAGE_SIZE - 1)) {
		pr_warn("siw: mmap not page aligned\n");
		return -EINVAL;
	}
	rdma_entry = rdma_user_mmap_entry_get(&uctx->base_ucontext, vma);
	if (!rdma_entry) {
		siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %#zx\n",
			vma->vm_pgoff, size);
		return -EINVAL;
	}
	entry = to_siw_mmap_entry(rdma_entry);

	rv = remap_vmalloc_range(vma, entry->address, 0);
	if (rv)
		pr_warn("remap_vmalloc_range failed: %lu, %zu\n", vma->vm_pgoff,
			size);
	/* Drop the reference taken by the lookup above, success or not */
	rdma_user_mmap_entry_put(rdma_entry);

	return rv;
}

/*
 * siw_alloc_ucontext()
 *
 * Account a new user context against SIW_MAX_CONTEXT and report
 * the device id back to user space.
 *
 * Returns 0 on success, -ENOMEM if the context limit is exceeded,
 * -EINVAL if the user response buffer is too small, or the error
 * from ib_copy_to_udata(). On any failure the context counter is
 * decremented again.
 */
int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_ctx->device);
	struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
	struct siw_uresp_alloc_ctx uresp = {};
	int rv;

	if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
		rv = -ENOMEM;
		goto err_out;
	}
	ctx->sdev = sdev;

	uresp.dev_id = sdev->vendor_part_id;

	if (udata->outlen < sizeof(uresp)) {
		rv = -EINVAL;
		goto err_out;
	}
	rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
	if (rv)
		goto err_out;

	siw_dbg(base_ctx->device, "success. now %d context(s)\n",
		atomic_read(&sdev->num_ctx));

	return 0;

err_out:
	/* Undo the increment done by atomic_inc_return() above */
	atomic_dec(&sdev->num_ctx);
	siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
		atomic_read(&sdev->num_ctx));

	return rv;
}

/* Release the context accounting taken in siw_alloc_ucontext(). */
void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
{
	struct siw_ucontext *uctx = to_siw_ctx(base_ctx);

	atomic_dec(&uctx->sdev->num_ctx);
}

/*
 * siw_query_device()
 *
 * Fill @attr with the device limits kept in sdev->attrs plus some
 * fixed siw capabilities.
 *
 * Returns -EINVAL if @udata carries any input or output bytes,
 * 0 otherwise.
 */
int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
		     struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	if (udata->inlen || udata->outlen)
		return -EINVAL;

	memset(attr, 0, sizeof(*attr));

	/* Revisit atomic caps if RFC 7306 gets supported */
	attr->atomic_cap = 0;
	attr->device_cap_flags = IB_DEVICE_MEM_MGT_EXTENSIONS;
	attr->kernel_cap_flags = IBK_ALLOW_USER_UNREG;
	attr->max_cq = sdev->attrs.max_cq;
	attr->max_cqe = sdev->attrs.max_cqe;
	attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
	attr->max_mr = sdev->attrs.max_mr;
	attr->max_mw = sdev->attrs.max_mw;
	attr->max_mr_size = ~0ull;
	attr->max_pd = sdev->attrs.max_pd;
	attr->max_qp = sdev->attrs.max_qp;
	attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
	attr->max_qp_rd_atom = sdev->attrs.max_ord;
	attr->max_qp_wr = sdev->attrs.max_qp_wr;
	attr->max_recv_sge = sdev->attrs.max_sge;
	attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
	attr->max_send_sge = sdev->attrs.max_sge;
	attr->max_sge_rd = sdev->attrs.max_sge_rd;
	attr->max_srq = sdev->attrs.max_srq;
	attr->max_srq_sge = sdev->attrs.max_srq_sge;
	attr->max_srq_wr = sdev->attrs.max_srq_wr;
	attr->page_size_cap = PAGE_SIZE;
	attr->vendor_id = SIW_VENDOR_ID;
	attr->vendor_part_id = sdev->vendor_part_id;

	/* System image GUID is derived from the device's raw GID bytes */
	addrconf_addr_eui48((u8 *)&attr->sys_image_guid,
			    sdev->raw_gid);

	return 0;
}

/*
 * siw_query_port()
 *
 * Fill @attr for the given port. MTU values are taken from the
 * attached netdev, port state from sdev->state.
 *
 * Returns the result of ib_get_eth_speed(). Note that the remaining
 * attributes are filled in even if that call failed.
 */
int siw_query_port(struct ib_device *base_dev, u32 port,
		   struct ib_port_attr *attr)
{
	struct siw_device *sdev = to_siw_dev(base_dev);
	int rv;

	memset(attr, 0, sizeof(*attr));

	rv = ib_get_eth_speed(base_dev, port, &attr->active_speed,
			      &attr->active_width);
	attr->gid_tbl_len = 1;
	attr->max_msg_sz = -1;
	attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
	attr->active_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
	attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
		IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
	attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
	attr->state = sdev->state;
	/*
	 * All zero
	 *
	 * attr->lid = 0;
	 * attr->bad_pkey_cntr = 0;
	 * attr->qkey_viol_cntr = 0;
	 * attr->sm_lid = 0;
	 * attr->lmc = 0;
	 * attr->max_vl_num = 0;
	 * attr->sm_sl = 0;
	 * attr->subnet_timeout = 0;
	 * attr->init_type_reply = 0;
	 */
	return rv;
}

/*
 * siw_get_port_immutable()
 *
 * Report the immutable port properties: GID table length as
 * returned by siw_query_port() and the iWARP core capability flag.
 *
 * Returns 0, or the error from siw_query_port().
 */
int siw_get_port_immutable(struct ib_device *base_dev, u32 port,
			   struct ib_port_immutable *port_immutable)
{
	struct ib_port_attr attr;
	int rv = siw_query_port(base_dev, port, &attr);

	if (rv)
		return rv;

	port_immutable->gid_tbl_len = attr.gid_tbl_len;
	port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;

	return 0;
}

/*
 * siw_query_gid()
 *
 * Return the device GID: the first ETH_ALEN bytes are copied from
 * sdev->raw_gid, all remaining bytes are zero. @port and @idx are
 * not evaluated.
 */
int siw_query_gid(struct ib_device *base_dev, u32 port, int idx,
		  union ib_gid *gid)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	/* subnet_prefix == interface_id == 0; */
	memset(gid, 0, sizeof(*gid));
	memcpy(gid->raw, sdev->raw_gid, ETH_ALEN);

	return 0;
}

/*
 * siw_alloc_pd()
 *
 * Account a new protection domain against SIW_MAX_PD.
 *
 * Returns 0 on success or -ENOMEM if the limit is exceeded, in
 * which case the counter is rolled back.
 */
int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(pd->device);

	if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
		atomic_dec(&sdev->num_pd);
		return -ENOMEM;
	}
	siw_dbg_pd(pd, "now %d PD's(s)\n", atomic_read(&sdev->num_pd));

	return 0;
}

/* Release the PD accounting taken in siw_alloc_pd(). Always returns 0. */
int siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(pd->device);

	siw_dbg_pd(pd, "free PD\n");
	atomic_dec(&sdev->num_pd);

	return 0;
}

/* Take a reference on the siw QP containing @base_qp. */
void siw_qp_get_ref(struct ib_qp *base_qp)
{
	siw_qp_get(to_siw_qp(base_qp));
}

/* Drop a reference on the siw QP containing @base_qp. */
void siw_qp_put_ref(struct ib_qp *base_qp)
{
	siw_qp_put(to_siw_qp(base_qp));
}

/*
 * siw_mmap_entry_insert()
 *
 * Allocate an mmap entry for @address and register it with the
 * user context.
 *
 * @uctx:	user context to register the entry with
 * @address:	memory to be mapped later via siw_mmap()
 * @length:	length of that memory in bytes
 * @offset:	set to the mmap offset of the new entry, or to
 *		SIW_INVAL_UOBJ_KEY if the function fails
 *
 * Returns the embedded rdma_user_mmap_entry, or NULL if either the
 * allocation or the registration failed.
 */
static struct rdma_user_mmap_entry *
siw_mmap_entry_insert(struct siw_ucontext *uctx,
		      void *address, size_t length,
		      u64 *offset)
{
	struct siw_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
	int rv;

	/* Preset the failure value so every early return reports it */
	*offset = SIW_INVAL_UOBJ_KEY;
	if (!entry)
		return NULL;

	entry->address = address;

	rv = rdma_user_mmap_entry_insert(&uctx->base_ucontext,
					 &entry->rdma_entry,
					 length);
	if (rv) {
		kfree(entry);
		return NULL;
	}

	*offset = rdma_user_mmap_get_offset(&entry->rdma_entry);

	return &entry->rdma_entry;
}

/*
 * siw_create_qp()
 *
 * Create QP of requested size on given device.
 *
 * @qp:		Queue pair
 * @attrs:	Initial QP attributes.
 * @udata:	used to provide QP ID, SQ and RQ size back to user.
 */

/*
 * Returns 0 on success or a negative errno:
 *   -EOPNOTSUPP  create flags set, or QP type is not RC
 *   -ENOMEM      QP limit exceeded, queue allocation failed, or an
 *                mmap entry could not be inserted
 *   -EINVAL      invalid sizes, missing CQs, unknown signalling type,
 *                user response buffer too small, or no TX CPU found
 * plus any error from siw_qp_add() or ib_copy_to_udata().
 *
 * Two unwind levels exist: err_atomic only rolls back the QP counter,
 * err_out_xa additionally removes the QP from the xarray, removes the
 * mmap entries (user QPs only) and frees both queues.
 */
int siw_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *attrs,
		  struct ib_udata *udata)
{
	struct ib_pd *pd = ibqp->pd;
	struct siw_qp *qp = to_siw_qp(ibqp);
	struct ib_device *base_dev = pd->device;
	struct siw_device *sdev = to_siw_dev(base_dev);
	struct siw_ucontext *uctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);
	unsigned long flags;
	int num_sqe, num_rqe, rv = 0;
	size_t length;

	siw_dbg(base_dev, "create new QP\n");

	/* Checked before the counter is bumped, so a plain return is fine */
	if (attrs->create_flags)
		return -EOPNOTSUPP;

	if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
		siw_dbg(base_dev, "too many QP's\n");
		rv = -ENOMEM;
		goto err_atomic;
	}
	if (attrs->qp_type != IB_QPT_RC) {
		siw_dbg(base_dev, "only RC QP's supported\n");
		rv = -EOPNOTSUPP;
		goto err_atomic;
	}
	if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
	    (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
	    (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
	    (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
		siw_dbg(base_dev, "QP size error\n");
		rv = -EINVAL;
		goto err_atomic;
	}
	if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
		siw_dbg(base_dev, "max inline send: %d > %d\n",
			attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
		rv = -EINVAL;
		goto err_atomic;
	}
	/*
	 * NOTE: we don't allow for a QP unable to hold any SQ WQE
	 */
	if (attrs->cap.max_send_wr == 0) {
		siw_dbg(base_dev, "QP must have send queue\n");
		rv = -EINVAL;
		goto err_atomic;
	}

	/* A receive CQ may only be omitted if an SRQ is given */
	if (!attrs->send_cq || (!attrs->recv_cq && !attrs->srq)) {
		siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
		rv = -EINVAL;
		goto err_atomic;
	}

	init_rwsem(&qp->state_lock);
	spin_lock_init(&qp->sq_lock);
	spin_lock_init(&qp->rq_lock);
	spin_lock_init(&qp->orq_lock);

	rv = siw_qp_add(sdev, qp);
	if (rv)
		goto err_atomic;


	/* All queue indices are derived from modulo operations
	 * on a free running 'get' (consumer) and 'put' (producer)
	 * unsigned counter. Having queue sizes at power of two
	 * avoids handling counter wrap around.
	 */
	num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
	num_rqe = attrs->cap.max_recv_wr;
	if (num_rqe)
		num_rqe = roundup_pow_of_two(num_rqe);

	/* User QPs get memory suitable for mapping via siw_mmap() */
	if (udata)
		qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));
	else
		qp->sendq = vcalloc(num_sqe, sizeof(struct siw_sqe));

	if (qp->sendq == NULL) {
		rv = -ENOMEM;
		goto err_out_xa;
	}
	if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
		if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
			qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
		else {
			rv = -EINVAL;
			goto err_out_xa;
		}
	}
	qp->pd = pd;
	qp->scq = to_siw_cq(attrs->send_cq);
	qp->rcq = to_siw_cq(attrs->recv_cq);

	if (attrs->srq) {
		/*
		 * SRQ support.
		 * Verbs 6.3.7: ignore RQ size, if SRQ present
		 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
		 */
		qp->srq = to_siw_srq(attrs->srq);
		qp->attrs.rq_size = 0;
		siw_dbg(base_dev, "QP [%u]: SRQ attached\n",
			qp->base_qp.qp_num);
	} else if (num_rqe) {
		if (udata)
			qp->recvq =
				vmalloc_user(num_rqe * sizeof(struct siw_rqe));
		else
			qp->recvq = vcalloc(num_rqe, sizeof(struct siw_rqe));

		if (qp->recvq == NULL) {
			rv = -ENOMEM;
			goto err_out_xa;
		}
		qp->attrs.rq_size = num_rqe;
	}
	qp->attrs.sq_size = num_sqe;
	qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
	qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;

	/* Make those two tunables fixed for now. */
	qp->tx_ctx.gso_seg_limit = 1;
	qp->tx_ctx.zcopy_tx = zcopy_tx;

	qp->attrs.state = SIW_QP_STATE_IDLE;

	if (udata) {
		struct siw_uresp_create_qp uresp = {};

		uresp.num_sqe = num_sqe;
		uresp.num_rqe = num_rqe;
		uresp.qp_id = qp_id(qp);

		/* Register each queue for mmap; the keys go to user space */
		if (qp->sendq) {
			length = num_sqe * sizeof(struct siw_sqe);
			qp->sq_entry =
				siw_mmap_entry_insert(uctx, qp->sendq,
						      length, &uresp.sq_key);
			if (!qp->sq_entry) {
				rv = -ENOMEM;
				goto err_out_xa;
			}
		}

		if (qp->recvq) {
			length = num_rqe * sizeof(struct siw_rqe);
			qp->rq_entry =
				siw_mmap_entry_insert(uctx, qp->recvq,
						      length, &uresp.rq_key);
			if (!qp->rq_entry) {
				uresp.sq_key = SIW_INVAL_UOBJ_KEY;
				rv = -ENOMEM;
				goto err_out_xa;
			}
		}

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out_xa;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out_xa;
	}
	qp->tx_cpu = siw_get_tx_cpu(sdev);
	if (qp->tx_cpu < 0) {
		rv = -EINVAL;
		goto err_out_xa;
	}
	/* Link the QP into the per-device QP list */
	INIT_LIST_HEAD(&qp->devq);
	spin_lock_irqsave(&sdev->lock, flags);
	list_add_tail(&qp->devq, &sdev->qp_list);
	spin_unlock_irqrestore(&sdev->lock, flags);

	/* siw_destroy_qp() waits on this completion */
	init_completion(&qp->qp_free);

	return 0;

err_out_xa:
	xa_erase(&sdev->qp_xa, qp_id(qp));
	if (uctx) {
		rdma_user_mmap_entry_remove(qp->sq_entry);
		rdma_user_mmap_entry_remove(qp->rq_entry);
	}
	vfree(qp->sendq);
	vfree(qp->recvq);

err_atomic:
	atomic_dec(&sdev->num_qp);
	return rv;
}

/*
 * Minimum siw_query_qp() verb interface.
501 * 502 * @qp_attr_mask is not used but all available information is provided 503 */ 504 int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr, 505 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) 506 { 507 struct siw_qp *qp; 508 struct siw_device *sdev; 509 510 if (base_qp && qp_attr && qp_init_attr) { 511 qp = to_siw_qp(base_qp); 512 sdev = to_siw_dev(base_qp->device); 513 } else { 514 return -EINVAL; 515 } 516 qp_attr->qp_state = siw_qp_state_to_ib_qp_state[qp->attrs.state]; 517 qp_attr->cap.max_inline_data = SIW_MAX_INLINE; 518 qp_attr->cap.max_send_wr = qp->attrs.sq_size; 519 qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges; 520 qp_attr->cap.max_recv_wr = qp->attrs.rq_size; 521 qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges; 522 qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu); 523 qp_attr->max_rd_atomic = qp->attrs.irq_size; 524 qp_attr->max_dest_rd_atomic = qp->attrs.orq_size; 525 526 qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE | 527 IB_ACCESS_REMOTE_WRITE | 528 IB_ACCESS_REMOTE_READ; 529 530 qp_init_attr->qp_type = base_qp->qp_type; 531 qp_init_attr->send_cq = base_qp->send_cq; 532 qp_init_attr->recv_cq = base_qp->recv_cq; 533 qp_init_attr->srq = base_qp->srq; 534 535 qp_init_attr->cap = qp_attr->cap; 536 537 return 0; 538 } 539 540 int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr, 541 int attr_mask, struct ib_udata *udata) 542 { 543 struct siw_qp_attrs new_attrs; 544 enum siw_qp_attr_mask siw_attr_mask = 0; 545 struct siw_qp *qp = to_siw_qp(base_qp); 546 int rv = 0; 547 548 if (!attr_mask) 549 return 0; 550 551 if (attr_mask & ~IB_QP_ATTR_STANDARD_BITS) 552 return -EOPNOTSUPP; 553 554 memset(&new_attrs, 0, sizeof(new_attrs)); 555 556 if (attr_mask & IB_QP_ACCESS_FLAGS) { 557 siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS; 558 559 if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ) 560 new_attrs.flags |= SIW_RDMA_READ_ENABLED; 561 if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE) 562 new_attrs.flags 
|= SIW_RDMA_WRITE_ENABLED; 563 if (attr->qp_access_flags & IB_ACCESS_MW_BIND) 564 new_attrs.flags |= SIW_RDMA_BIND_ENABLED; 565 } 566 if (attr_mask & IB_QP_STATE) { 567 siw_dbg_qp(qp, "desired IB QP state: %s\n", 568 ib_qp_state_to_string[attr->qp_state]); 569 570 new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state]; 571 572 if (new_attrs.state > SIW_QP_STATE_RTS) 573 qp->tx_ctx.tx_suspend = 1; 574 575 siw_attr_mask |= SIW_QP_ATTR_STATE; 576 } 577 if (!siw_attr_mask) 578 goto out; 579 580 down_write(&qp->state_lock); 581 582 rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask); 583 584 up_write(&qp->state_lock); 585 out: 586 return rv; 587 } 588 589 int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata) 590 { 591 struct siw_qp *qp = to_siw_qp(base_qp); 592 struct siw_ucontext *uctx = 593 rdma_udata_to_drv_context(udata, struct siw_ucontext, 594 base_ucontext); 595 struct siw_qp_attrs qp_attrs; 596 597 siw_dbg_qp(qp, "state %d\n", qp->attrs.state); 598 599 /* 600 * Mark QP as in process of destruction to prevent from 601 * any async callbacks to RDMA core 602 */ 603 qp->attrs.flags |= SIW_QP_IN_DESTROY; 604 qp->rx_stream.rx_suspend = 1; 605 606 if (uctx) { 607 rdma_user_mmap_entry_remove(qp->sq_entry); 608 rdma_user_mmap_entry_remove(qp->rq_entry); 609 } 610 611 down_write(&qp->state_lock); 612 613 qp_attrs.state = SIW_QP_STATE_ERROR; 614 siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE); 615 616 if (qp->cep) { 617 siw_cep_put(qp->cep); 618 qp->cep = NULL; 619 } 620 up_write(&qp->state_lock); 621 622 kfree(qp->tx_ctx.mpa_crc_hd); 623 kfree(qp->rx_stream.mpa_crc_hd); 624 625 qp->scq = qp->rcq = NULL; 626 627 siw_qp_put(qp); 628 wait_for_completion(&qp->qp_free); 629 630 return 0; 631 } 632 633 /* 634 * siw_copy_inline_sgl() 635 * 636 * Prepare sgl of inlined data for sending. For userland callers 637 * function checks if given buffer addresses and len's are within 638 * process context bounds. 
639 * Data from all provided sge's are copied together into the wqe, 640 * referenced by a single sge. 641 */ 642 static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr, 643 struct siw_sqe *sqe) 644 { 645 struct ib_sge *core_sge = core_wr->sg_list; 646 void *kbuf = &sqe->sge[1]; 647 int num_sge = core_wr->num_sge, bytes = 0; 648 649 sqe->sge[0].laddr = (uintptr_t)kbuf; 650 sqe->sge[0].lkey = 0; 651 652 while (num_sge--) { 653 if (!core_sge->length) { 654 core_sge++; 655 continue; 656 } 657 bytes += core_sge->length; 658 if (bytes > SIW_MAX_INLINE) { 659 bytes = -EINVAL; 660 break; 661 } 662 memcpy(kbuf, ib_virt_dma_to_ptr(core_sge->addr), 663 core_sge->length); 664 665 kbuf += core_sge->length; 666 core_sge++; 667 } 668 sqe->sge[0].length = max(bytes, 0); 669 sqe->num_sge = bytes > 0 ? 1 : 0; 670 671 return bytes; 672 } 673 674 /* Complete SQ WR's without processing */ 675 static int siw_sq_flush_wr(struct siw_qp *qp, const struct ib_send_wr *wr, 676 const struct ib_send_wr **bad_wr) 677 { 678 int rv = 0; 679 680 while (wr) { 681 struct siw_sqe sqe = {}; 682 683 switch (wr->opcode) { 684 case IB_WR_RDMA_WRITE: 685 sqe.opcode = SIW_OP_WRITE; 686 break; 687 case IB_WR_RDMA_READ: 688 sqe.opcode = SIW_OP_READ; 689 break; 690 case IB_WR_RDMA_READ_WITH_INV: 691 sqe.opcode = SIW_OP_READ_LOCAL_INV; 692 break; 693 case IB_WR_SEND: 694 sqe.opcode = SIW_OP_SEND; 695 break; 696 case IB_WR_SEND_WITH_IMM: 697 sqe.opcode = SIW_OP_SEND_WITH_IMM; 698 break; 699 case IB_WR_SEND_WITH_INV: 700 sqe.opcode = SIW_OP_SEND_REMOTE_INV; 701 break; 702 case IB_WR_LOCAL_INV: 703 sqe.opcode = SIW_OP_INVAL_STAG; 704 break; 705 case IB_WR_REG_MR: 706 sqe.opcode = SIW_OP_REG_MR; 707 break; 708 default: 709 rv = -EINVAL; 710 break; 711 } 712 if (!rv) { 713 sqe.id = wr->wr_id; 714 rv = siw_sqe_complete(qp, &sqe, 0, 715 SIW_WC_WR_FLUSH_ERR); 716 } 717 if (rv) { 718 if (bad_wr) 719 *bad_wr = wr; 720 break; 721 } 722 wr = wr->next; 723 } 724 return rv; 725 } 726 727 /* Complete RQ WR's without 
processing */ 728 static int siw_rq_flush_wr(struct siw_qp *qp, const struct ib_recv_wr *wr, 729 const struct ib_recv_wr **bad_wr) 730 { 731 struct siw_rqe rqe = {}; 732 int rv = 0; 733 734 while (wr) { 735 rqe.id = wr->wr_id; 736 rv = siw_rqe_complete(qp, &rqe, 0, 0, SIW_WC_WR_FLUSH_ERR); 737 if (rv) { 738 if (bad_wr) 739 *bad_wr = wr; 740 break; 741 } 742 wr = wr->next; 743 } 744 return rv; 745 } 746 747 /* 748 * siw_post_send() 749 * 750 * Post a list of S-WR's to a SQ. 751 * 752 * @base_qp: Base QP contained in siw QP 753 * @wr: Null terminated list of user WR's 754 * @bad_wr: Points to failing WR in case of synchronous failure. 755 */ 756 int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr, 757 const struct ib_send_wr **bad_wr) 758 { 759 struct siw_qp *qp = to_siw_qp(base_qp); 760 struct siw_wqe *wqe = tx_wqe(qp); 761 762 unsigned long flags; 763 int rv = 0; 764 765 if (wr && !rdma_is_kernel_res(&qp->base_qp.res)) { 766 siw_dbg_qp(qp, "wr must be empty for user mapped sq\n"); 767 *bad_wr = wr; 768 return -EINVAL; 769 } 770 771 /* 772 * Try to acquire QP state lock. Must be non-blocking 773 * to accommodate kernel clients needs. 774 */ 775 if (!down_read_trylock(&qp->state_lock)) { 776 if (qp->attrs.state == SIW_QP_STATE_ERROR) { 777 /* 778 * ERROR state is final, so we can be sure 779 * this state will not change as long as the QP 780 * exists. 781 * 782 * This handles an ib_drain_sq() call with 783 * a concurrent request to set the QP state 784 * to ERROR. 785 */ 786 rv = siw_sq_flush_wr(qp, wr, bad_wr); 787 } else { 788 siw_dbg_qp(qp, "QP locked, state %d\n", 789 qp->attrs.state); 790 *bad_wr = wr; 791 rv = -ENOTCONN; 792 } 793 return rv; 794 } 795 if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) { 796 if (qp->attrs.state == SIW_QP_STATE_ERROR) { 797 /* 798 * Immediately flush this WR to CQ, if QP 799 * is in ERROR state. SQ is guaranteed to 800 * be empty, so WR complets in-order. 801 * 802 * Typically triggered by ib_drain_sq(). 
803 */ 804 rv = siw_sq_flush_wr(qp, wr, bad_wr); 805 } else { 806 siw_dbg_qp(qp, "QP out of state %d\n", 807 qp->attrs.state); 808 *bad_wr = wr; 809 rv = -ENOTCONN; 810 } 811 up_read(&qp->state_lock); 812 return rv; 813 } 814 spin_lock_irqsave(&qp->sq_lock, flags); 815 816 while (wr) { 817 u32 idx = qp->sq_put % qp->attrs.sq_size; 818 struct siw_sqe *sqe = &qp->sendq[idx]; 819 820 if (sqe->flags) { 821 siw_dbg_qp(qp, "sq full\n"); 822 rv = -ENOMEM; 823 break; 824 } 825 if (wr->num_sge > qp->attrs.sq_max_sges) { 826 siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); 827 rv = -EINVAL; 828 break; 829 } 830 sqe->id = wr->wr_id; 831 832 if ((wr->send_flags & IB_SEND_SIGNALED) || 833 (qp->attrs.flags & SIW_SIGNAL_ALL_WR)) 834 sqe->flags |= SIW_WQE_SIGNALLED; 835 836 if (wr->send_flags & IB_SEND_FENCE) 837 sqe->flags |= SIW_WQE_READ_FENCE; 838 839 switch (wr->opcode) { 840 case IB_WR_SEND: 841 case IB_WR_SEND_WITH_INV: 842 if (wr->send_flags & IB_SEND_SOLICITED) 843 sqe->flags |= SIW_WQE_SOLICITED; 844 845 if (!(wr->send_flags & IB_SEND_INLINE)) { 846 siw_copy_sgl(wr->sg_list, sqe->sge, 847 wr->num_sge); 848 sqe->num_sge = wr->num_sge; 849 } else { 850 rv = siw_copy_inline_sgl(wr, sqe); 851 if (rv <= 0) { 852 rv = -EINVAL; 853 break; 854 } 855 sqe->flags |= SIW_WQE_INLINE; 856 sqe->num_sge = 1; 857 } 858 if (wr->opcode == IB_WR_SEND) 859 sqe->opcode = SIW_OP_SEND; 860 else { 861 sqe->opcode = SIW_OP_SEND_REMOTE_INV; 862 sqe->rkey = wr->ex.invalidate_rkey; 863 } 864 break; 865 866 case IB_WR_RDMA_READ_WITH_INV: 867 case IB_WR_RDMA_READ: 868 /* 869 * iWarp restricts RREAD sink to SGL containing 870 * 1 SGE only. we could relax to SGL with multiple 871 * elements referring the SAME ltag or even sending 872 * a private per-rreq tag referring to a checked 873 * local sgl with MULTIPLE ltag's. 874 */ 875 if (unlikely(wr->num_sge != 1)) { 876 rv = -EINVAL; 877 break; 878 } 879 siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1); 880 /* 881 * NOTE: zero length RREAD is allowed! 
882 */ 883 sqe->raddr = rdma_wr(wr)->remote_addr; 884 sqe->rkey = rdma_wr(wr)->rkey; 885 sqe->num_sge = 1; 886 887 if (wr->opcode == IB_WR_RDMA_READ) 888 sqe->opcode = SIW_OP_READ; 889 else 890 sqe->opcode = SIW_OP_READ_LOCAL_INV; 891 break; 892 893 case IB_WR_RDMA_WRITE: 894 if (!(wr->send_flags & IB_SEND_INLINE)) { 895 siw_copy_sgl(wr->sg_list, &sqe->sge[0], 896 wr->num_sge); 897 sqe->num_sge = wr->num_sge; 898 } else { 899 rv = siw_copy_inline_sgl(wr, sqe); 900 if (unlikely(rv < 0)) { 901 rv = -EINVAL; 902 break; 903 } 904 sqe->flags |= SIW_WQE_INLINE; 905 sqe->num_sge = 1; 906 } 907 sqe->raddr = rdma_wr(wr)->remote_addr; 908 sqe->rkey = rdma_wr(wr)->rkey; 909 sqe->opcode = SIW_OP_WRITE; 910 break; 911 912 case IB_WR_REG_MR: 913 sqe->base_mr = (uintptr_t)reg_wr(wr)->mr; 914 sqe->rkey = reg_wr(wr)->key; 915 sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK; 916 sqe->opcode = SIW_OP_REG_MR; 917 break; 918 919 case IB_WR_LOCAL_INV: 920 sqe->rkey = wr->ex.invalidate_rkey; 921 sqe->opcode = SIW_OP_INVAL_STAG; 922 break; 923 924 default: 925 siw_dbg_qp(qp, "ib wr type %d unsupported\n", 926 wr->opcode); 927 rv = -EINVAL; 928 break; 929 } 930 siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n", 931 sqe->opcode, sqe->flags, 932 (void *)(uintptr_t)sqe->id); 933 934 if (unlikely(rv < 0)) 935 break; 936 937 /* make SQE only valid after completely written */ 938 smp_wmb(); 939 sqe->flags |= SIW_WQE_VALID; 940 941 qp->sq_put++; 942 wr = wr->next; 943 } 944 945 /* 946 * Send directly if SQ processing is not in progress. 947 * Eventual immediate errors (rv < 0) do not affect the involved 948 * RI resources (Verbs, 8.3.1) and thus do not prevent from SQ 949 * processing, if new work is already pending. But rv must be passed 950 * to caller. 
951 */ 952 if (wqe->wr_status != SIW_WR_IDLE) { 953 spin_unlock_irqrestore(&qp->sq_lock, flags); 954 goto skip_direct_sending; 955 } 956 rv = siw_activate_tx(qp); 957 spin_unlock_irqrestore(&qp->sq_lock, flags); 958 959 if (rv <= 0) 960 goto skip_direct_sending; 961 962 if (rdma_is_kernel_res(&qp->base_qp.res)) { 963 rv = siw_sq_start(qp); 964 } else { 965 qp->tx_ctx.in_syscall = 1; 966 967 if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend)) 968 siw_qp_cm_drop(qp, 0); 969 970 qp->tx_ctx.in_syscall = 0; 971 } 972 skip_direct_sending: 973 974 up_read(&qp->state_lock); 975 976 if (rv >= 0) 977 return 0; 978 /* 979 * Immediate error 980 */ 981 siw_dbg_qp(qp, "error %d\n", rv); 982 983 *bad_wr = wr; 984 return rv; 985 } 986 987 /* 988 * siw_post_receive() 989 * 990 * Post a list of R-WR's to a RQ. 991 * 992 * @base_qp: Base QP contained in siw QP 993 * @wr: Null terminated list of user WR's 994 * @bad_wr: Points to failing WR in case of synchronous failure. 995 */ 996 int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr, 997 const struct ib_recv_wr **bad_wr) 998 { 999 struct siw_qp *qp = to_siw_qp(base_qp); 1000 unsigned long flags; 1001 int rv = 0; 1002 1003 if (qp->srq || qp->attrs.rq_size == 0) { 1004 *bad_wr = wr; 1005 return -EINVAL; 1006 } 1007 if (!rdma_is_kernel_res(&qp->base_qp.res)) { 1008 siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n"); 1009 *bad_wr = wr; 1010 return -EINVAL; 1011 } 1012 1013 /* 1014 * Try to acquire QP state lock. Must be non-blocking 1015 * to accommodate kernel clients needs. 1016 */ 1017 if (!down_read_trylock(&qp->state_lock)) { 1018 if (qp->attrs.state == SIW_QP_STATE_ERROR) { 1019 /* 1020 * ERROR state is final, so we can be sure 1021 * this state will not change as long as the QP 1022 * exists. 1023 * 1024 * This handles an ib_drain_rq() call with 1025 * a concurrent request to set the QP state 1026 * to ERROR. 
1027 */ 1028 rv = siw_rq_flush_wr(qp, wr, bad_wr); 1029 } else { 1030 siw_dbg_qp(qp, "QP locked, state %d\n", 1031 qp->attrs.state); 1032 *bad_wr = wr; 1033 rv = -ENOTCONN; 1034 } 1035 return rv; 1036 } 1037 if (qp->attrs.state > SIW_QP_STATE_RTS) { 1038 if (qp->attrs.state == SIW_QP_STATE_ERROR) { 1039 /* 1040 * Immediately flush this WR to CQ, if QP 1041 * is in ERROR state. RQ is guaranteed to 1042 * be empty, so WR complets in-order. 1043 * 1044 * Typically triggered by ib_drain_rq(). 1045 */ 1046 rv = siw_rq_flush_wr(qp, wr, bad_wr); 1047 } else { 1048 siw_dbg_qp(qp, "QP out of state %d\n", 1049 qp->attrs.state); 1050 *bad_wr = wr; 1051 rv = -ENOTCONN; 1052 } 1053 up_read(&qp->state_lock); 1054 return rv; 1055 } 1056 /* 1057 * Serialize potentially multiple producers. 1058 * Not needed for single threaded consumer side. 1059 */ 1060 spin_lock_irqsave(&qp->rq_lock, flags); 1061 1062 while (wr) { 1063 u32 idx = qp->rq_put % qp->attrs.rq_size; 1064 struct siw_rqe *rqe = &qp->recvq[idx]; 1065 1066 if (rqe->flags) { 1067 siw_dbg_qp(qp, "RQ full\n"); 1068 rv = -ENOMEM; 1069 break; 1070 } 1071 if (wr->num_sge > qp->attrs.rq_max_sges) { 1072 siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge); 1073 rv = -EINVAL; 1074 break; 1075 } 1076 rqe->id = wr->wr_id; 1077 rqe->num_sge = wr->num_sge; 1078 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); 1079 1080 /* make sure RQE is completely written before valid */ 1081 smp_wmb(); 1082 1083 rqe->flags = SIW_WQE_VALID; 1084 1085 qp->rq_put++; 1086 wr = wr->next; 1087 } 1088 spin_unlock_irqrestore(&qp->rq_lock, flags); 1089 1090 up_read(&qp->state_lock); 1091 1092 if (rv < 0) { 1093 siw_dbg_qp(qp, "error %d\n", rv); 1094 *bad_wr = wr; 1095 } 1096 return rv > 0 ? 
0 : rv; 1097 } 1098 1099 int siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata) 1100 { 1101 struct siw_cq *cq = to_siw_cq(base_cq); 1102 struct siw_device *sdev = to_siw_dev(base_cq->device); 1103 struct siw_ucontext *ctx = 1104 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1105 base_ucontext); 1106 1107 siw_dbg_cq(cq, "free CQ resources\n"); 1108 1109 siw_cq_flush(cq); 1110 1111 if (ctx) 1112 rdma_user_mmap_entry_remove(cq->cq_entry); 1113 1114 atomic_dec(&sdev->num_cq); 1115 1116 vfree(cq->queue); 1117 return 0; 1118 } 1119 1120 /* 1121 * siw_create_cq() 1122 * 1123 * Populate CQ of requested size 1124 * 1125 * @base_cq: CQ as allocated by RDMA midlayer 1126 * @attr: Initial CQ attributes 1127 * @attrs: uverbs bundle 1128 */ 1129 1130 int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr, 1131 struct uverbs_attr_bundle *attrs) 1132 { 1133 struct ib_udata *udata = &attrs->driver_udata; 1134 struct siw_device *sdev = to_siw_dev(base_cq->device); 1135 struct siw_cq *cq = to_siw_cq(base_cq); 1136 int rv, size = attr->cqe; 1137 1138 if (attr->flags) 1139 return -EOPNOTSUPP; 1140 1141 if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) { 1142 siw_dbg(base_cq->device, "too many CQ's\n"); 1143 rv = -ENOMEM; 1144 goto err_out; 1145 } 1146 if (size < 1 || size > sdev->attrs.max_cqe) { 1147 siw_dbg(base_cq->device, "CQ size error: %d\n", size); 1148 rv = -EINVAL; 1149 goto err_out; 1150 } 1151 size = roundup_pow_of_two(size); 1152 cq->base_cq.cqe = size; 1153 cq->num_cqe = size; 1154 1155 if (udata) 1156 cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) + 1157 sizeof(struct siw_cq_ctrl)); 1158 else 1159 cq->queue = vzalloc(size * sizeof(struct siw_cqe) + 1160 sizeof(struct siw_cq_ctrl)); 1161 1162 if (cq->queue == NULL) { 1163 rv = -ENOMEM; 1164 goto err_out; 1165 } 1166 get_random_bytes(&cq->id, 4); 1167 siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id); 1168 1169 spin_lock_init(&cq->lock); 1170 1171 cq->notify = (struct 
siw_cq_ctrl *)&cq->queue[size]; 1172 1173 if (udata) { 1174 struct siw_uresp_create_cq uresp = {}; 1175 struct siw_ucontext *ctx = 1176 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1177 base_ucontext); 1178 size_t length = size * sizeof(struct siw_cqe) + 1179 sizeof(struct siw_cq_ctrl); 1180 1181 cq->cq_entry = 1182 siw_mmap_entry_insert(ctx, cq->queue, 1183 length, &uresp.cq_key); 1184 if (!cq->cq_entry) { 1185 rv = -ENOMEM; 1186 goto err_out; 1187 } 1188 1189 uresp.cq_id = cq->id; 1190 uresp.num_cqe = size; 1191 1192 if (udata->outlen < sizeof(uresp)) { 1193 rv = -EINVAL; 1194 goto err_out; 1195 } 1196 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 1197 if (rv) 1198 goto err_out; 1199 } 1200 return 0; 1201 1202 err_out: 1203 siw_dbg(base_cq->device, "CQ creation failed: %d", rv); 1204 1205 if (cq->queue) { 1206 struct siw_ucontext *ctx = 1207 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1208 base_ucontext); 1209 if (ctx) 1210 rdma_user_mmap_entry_remove(cq->cq_entry); 1211 vfree(cq->queue); 1212 } 1213 atomic_dec(&sdev->num_cq); 1214 1215 return rv; 1216 } 1217 1218 /* 1219 * siw_poll_cq() 1220 * 1221 * Reap CQ entries if available and copy work completion status into 1222 * array of WC's provided by caller. Returns number of reaped CQE's. 1223 * 1224 * @base_cq: Base CQ contained in siw CQ. 1225 * @num_cqe: Maximum number of CQE's to reap. 1226 * @wc: Array of work completions to be filled by siw. 1227 */ 1228 int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc) 1229 { 1230 struct siw_cq *cq = to_siw_cq(base_cq); 1231 int i; 1232 1233 for (i = 0; i < num_cqe; i++) { 1234 if (!siw_reap_cqe(cq, wc)) 1235 break; 1236 wc++; 1237 } 1238 return i; 1239 } 1240 1241 /* 1242 * siw_req_notify_cq() 1243 * 1244 * Request notification for new CQE's added to that CQ. 
1245 * Defined flags: 1246 * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification 1247 * event if a WQE with notification flag set enters the CQ 1248 * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification 1249 * event if a WQE enters the CQ. 1250 * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the 1251 * number of not reaped CQE's regardless of its notification 1252 * type and current or new CQ notification settings. 1253 * 1254 * @base_cq: Base CQ contained in siw CQ. 1255 * @flags: Requested notification flags. 1256 */ 1257 int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags) 1258 { 1259 struct siw_cq *cq = to_siw_cq(base_cq); 1260 1261 siw_dbg_cq(cq, "flags: 0x%02x\n", flags); 1262 1263 if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED) 1264 /* 1265 * Enable CQ event for next solicited completion. 1266 * and make it visible to all associated producers. 1267 */ 1268 smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED); 1269 else 1270 /* 1271 * Enable CQ event for any signalled completion. 1272 * and make it visible to all associated producers. 1273 */ 1274 smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL); 1275 1276 if (flags & IB_CQ_REPORT_MISSED_EVENTS) 1277 return cq->cq_put - cq->cq_get; 1278 1279 return 0; 1280 } 1281 1282 /* 1283 * siw_dereg_mr() 1284 * 1285 * Release Memory Region. 1286 * 1287 * @base_mr: Base MR contained in siw MR. 1288 * @udata: points to user context, unused. 1289 */ 1290 int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata) 1291 { 1292 struct siw_mr *mr = to_siw_mr(base_mr); 1293 struct siw_device *sdev = to_siw_dev(base_mr->device); 1294 1295 siw_dbg_mem(mr->mem, "deregister MR\n"); 1296 1297 atomic_dec(&sdev->num_mr); 1298 1299 siw_mr_drop_mem(mr); 1300 kfree_rcu(mr, rcu); 1301 1302 return 0; 1303 } 1304 1305 /* 1306 * siw_reg_user_mr() 1307 * 1308 * Register Memory Region. 
1309 * 1310 * @pd: Protection Domain 1311 * @start: starting address of MR (virtual address) 1312 * @len: len of MR 1313 * @rnic_va: not used by siw 1314 * @rights: MR access rights 1315 * @udata: user buffer to communicate STag and Key. 1316 */ 1317 struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len, 1318 u64 rnic_va, int rights, struct ib_udata *udata) 1319 { 1320 struct siw_mr *mr = NULL; 1321 struct siw_umem *umem = NULL; 1322 struct siw_ureq_reg_mr ureq; 1323 struct siw_device *sdev = to_siw_dev(pd->device); 1324 int rv; 1325 1326 siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n", 1327 (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va, 1328 (unsigned long long)len); 1329 1330 if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { 1331 siw_dbg_pd(pd, "too many mr's\n"); 1332 rv = -ENOMEM; 1333 goto err_out; 1334 } 1335 if (!len) { 1336 rv = -EINVAL; 1337 goto err_out; 1338 } 1339 umem = siw_umem_get(pd->device, start, len, rights); 1340 if (IS_ERR(umem)) { 1341 rv = PTR_ERR(umem); 1342 siw_dbg_pd(pd, "getting user memory failed: %d\n", rv); 1343 umem = NULL; 1344 goto err_out; 1345 } 1346 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1347 if (!mr) { 1348 rv = -ENOMEM; 1349 goto err_out; 1350 } 1351 rv = siw_mr_add_mem(mr, pd, umem, start, len, rights); 1352 if (rv) 1353 goto err_out; 1354 1355 if (udata) { 1356 struct siw_uresp_reg_mr uresp = {}; 1357 struct siw_mem *mem = mr->mem; 1358 1359 if (udata->inlen < sizeof(ureq)) { 1360 rv = -EINVAL; 1361 goto err_out; 1362 } 1363 rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq)); 1364 if (rv) 1365 goto err_out; 1366 1367 mr->base_mr.lkey |= ureq.stag_key; 1368 mr->base_mr.rkey |= ureq.stag_key; 1369 mem->stag |= ureq.stag_key; 1370 uresp.stag = mem->stag; 1371 1372 if (udata->outlen < sizeof(uresp)) { 1373 rv = -EINVAL; 1374 goto err_out; 1375 } 1376 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 1377 if (rv) 1378 goto err_out; 1379 } 1380 mr->mem->stag_valid = 1; 1381 1382 return &mr->base_mr; 
1383 1384 err_out: 1385 atomic_dec(&sdev->num_mr); 1386 if (mr) { 1387 if (mr->mem) 1388 siw_mr_drop_mem(mr); 1389 kfree_rcu(mr, rcu); 1390 } else { 1391 if (umem) 1392 siw_umem_release(umem); 1393 } 1394 return ERR_PTR(rv); 1395 } 1396 1397 struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, 1398 u32 max_sge) 1399 { 1400 struct siw_device *sdev = to_siw_dev(pd->device); 1401 struct siw_mr *mr = NULL; 1402 struct siw_pbl *pbl = NULL; 1403 int rv; 1404 1405 if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { 1406 siw_dbg_pd(pd, "too many mr's\n"); 1407 rv = -ENOMEM; 1408 goto err_out; 1409 } 1410 if (mr_type != IB_MR_TYPE_MEM_REG) { 1411 siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type); 1412 rv = -EOPNOTSUPP; 1413 goto err_out; 1414 } 1415 if (max_sge > SIW_MAX_SGE_PBL) { 1416 siw_dbg_pd(pd, "too many sge's: %d\n", max_sge); 1417 rv = -ENOMEM; 1418 goto err_out; 1419 } 1420 pbl = siw_pbl_alloc(max_sge); 1421 if (IS_ERR(pbl)) { 1422 rv = PTR_ERR(pbl); 1423 siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv); 1424 pbl = NULL; 1425 goto err_out; 1426 } 1427 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1428 if (!mr) { 1429 rv = -ENOMEM; 1430 goto err_out; 1431 } 1432 rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0); 1433 if (rv) 1434 goto err_out; 1435 1436 mr->mem->is_pbl = 1; 1437 1438 siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); 1439 1440 return &mr->base_mr; 1441 1442 err_out: 1443 atomic_dec(&sdev->num_mr); 1444 1445 if (!mr) { 1446 kfree(pbl); 1447 } else { 1448 if (mr->mem) 1449 siw_mr_drop_mem(mr); 1450 kfree_rcu(mr, rcu); 1451 } 1452 siw_dbg_pd(pd, "failed: %d\n", rv); 1453 1454 return ERR_PTR(rv); 1455 } 1456 1457 /* Just used to count number of pages being mapped */ 1458 static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr) 1459 { 1460 return 0; 1461 } 1462 1463 int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle, 1464 unsigned int *sg_off) 1465 { 1466 struct scatterlist *slp; 1467 struct 
siw_mr *mr = to_siw_mr(base_mr); 1468 struct siw_mem *mem = mr->mem; 1469 struct siw_pbl *pbl = mem->pbl; 1470 struct siw_pble *pble; 1471 unsigned long pbl_size; 1472 int i, rv; 1473 1474 if (!pbl) { 1475 siw_dbg_mem(mem, "no PBL allocated\n"); 1476 return -EINVAL; 1477 } 1478 pble = pbl->pbe; 1479 1480 if (pbl->max_buf < num_sle) { 1481 siw_dbg_mem(mem, "too many SGE's: %d > %d\n", 1482 num_sle, pbl->max_buf); 1483 return -ENOMEM; 1484 } 1485 for_each_sg(sl, slp, num_sle, i) { 1486 if (sg_dma_len(slp) == 0) { 1487 siw_dbg_mem(mem, "empty SGE\n"); 1488 return -EINVAL; 1489 } 1490 if (i == 0) { 1491 pble->addr = sg_dma_address(slp); 1492 pble->size = sg_dma_len(slp); 1493 pble->pbl_off = 0; 1494 pbl_size = pble->size; 1495 pbl->num_buf = 1; 1496 } else { 1497 /* Merge PBL entries if adjacent */ 1498 if (pble->addr + pble->size == sg_dma_address(slp)) { 1499 pble->size += sg_dma_len(slp); 1500 } else { 1501 pble++; 1502 pbl->num_buf++; 1503 pble->addr = sg_dma_address(slp); 1504 pble->size = sg_dma_len(slp); 1505 pble->pbl_off = pbl_size; 1506 } 1507 pbl_size += sg_dma_len(slp); 1508 } 1509 siw_dbg_mem(mem, 1510 "sge[%d], size %u, addr 0x%p, total %lu\n", 1511 i, pble->size, ib_virt_dma_to_ptr(pble->addr), 1512 pbl_size); 1513 } 1514 rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page); 1515 if (rv > 0) { 1516 mem->len = base_mr->length; 1517 mem->va = base_mr->iova; 1518 siw_dbg_mem(mem, 1519 "%llu bytes, start 0x%pK, %u SLE to %u entries\n", 1520 mem->len, (void *)(uintptr_t)mem->va, num_sle, 1521 pbl->num_buf); 1522 } 1523 return rv; 1524 } 1525 1526 /* 1527 * siw_get_dma_mr() 1528 * 1529 * Create a (empty) DMA memory region, where no umem is attached. 
1530 */ 1531 struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights) 1532 { 1533 struct siw_device *sdev = to_siw_dev(pd->device); 1534 struct siw_mr *mr = NULL; 1535 int rv; 1536 1537 if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) { 1538 siw_dbg_pd(pd, "too many mr's\n"); 1539 rv = -ENOMEM; 1540 goto err_out; 1541 } 1542 mr = kzalloc(sizeof(*mr), GFP_KERNEL); 1543 if (!mr) { 1544 rv = -ENOMEM; 1545 goto err_out; 1546 } 1547 rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights); 1548 if (rv) 1549 goto err_out; 1550 1551 mr->mem->stag_valid = 1; 1552 1553 siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag); 1554 1555 return &mr->base_mr; 1556 1557 err_out: 1558 if (rv) 1559 kfree(mr); 1560 1561 atomic_dec(&sdev->num_mr); 1562 1563 return ERR_PTR(rv); 1564 } 1565 1566 /* 1567 * siw_create_srq() 1568 * 1569 * Create Shared Receive Queue of attributes @init_attrs 1570 * within protection domain given by @pd. 1571 * 1572 * @base_srq: Base SRQ contained in siw SRQ. 1573 * @init_attrs: SRQ init attributes. 
1574 * @udata: points to user context 1575 */ 1576 int siw_create_srq(struct ib_srq *base_srq, 1577 struct ib_srq_init_attr *init_attrs, struct ib_udata *udata) 1578 { 1579 struct siw_srq *srq = to_siw_srq(base_srq); 1580 struct ib_srq_attr *attrs = &init_attrs->attr; 1581 struct siw_device *sdev = to_siw_dev(base_srq->device); 1582 struct siw_ucontext *ctx = 1583 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1584 base_ucontext); 1585 int rv; 1586 1587 if (init_attrs->srq_type != IB_SRQT_BASIC) 1588 return -EOPNOTSUPP; 1589 1590 if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) { 1591 siw_dbg_pd(base_srq->pd, "too many SRQ's\n"); 1592 rv = -ENOMEM; 1593 goto err_out; 1594 } 1595 if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR || 1596 attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) { 1597 rv = -EINVAL; 1598 goto err_out; 1599 } 1600 srq->max_sge = attrs->max_sge; 1601 srq->num_rqe = roundup_pow_of_two(attrs->max_wr); 1602 srq->limit = attrs->srq_limit; 1603 if (srq->limit) 1604 srq->armed = true; 1605 1606 srq->is_kernel_res = !udata; 1607 1608 if (udata) 1609 srq->recvq = 1610 vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe)); 1611 else 1612 srq->recvq = vcalloc(srq->num_rqe, sizeof(struct siw_rqe)); 1613 1614 if (srq->recvq == NULL) { 1615 rv = -ENOMEM; 1616 goto err_out; 1617 } 1618 if (udata) { 1619 struct siw_uresp_create_srq uresp = {}; 1620 size_t length = srq->num_rqe * sizeof(struct siw_rqe); 1621 1622 srq->srq_entry = 1623 siw_mmap_entry_insert(ctx, srq->recvq, 1624 length, &uresp.srq_key); 1625 if (!srq->srq_entry) { 1626 rv = -ENOMEM; 1627 goto err_out; 1628 } 1629 1630 uresp.num_rqe = srq->num_rqe; 1631 1632 if (udata->outlen < sizeof(uresp)) { 1633 rv = -EINVAL; 1634 goto err_out; 1635 } 1636 rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp)); 1637 if (rv) 1638 goto err_out; 1639 } 1640 spin_lock_init(&srq->lock); 1641 1642 siw_dbg_pd(base_srq->pd, "[SRQ]: success\n"); 1643 1644 return 0; 1645 1646 err_out: 1647 
if (srq->recvq) { 1648 if (ctx) 1649 rdma_user_mmap_entry_remove(srq->srq_entry); 1650 vfree(srq->recvq); 1651 } 1652 atomic_dec(&sdev->num_srq); 1653 1654 return rv; 1655 } 1656 1657 /* 1658 * siw_modify_srq() 1659 * 1660 * Modify SRQ. The caller may resize SRQ and/or set/reset notification 1661 * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification. 1662 * 1663 * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE 1664 * parameter. siw_modify_srq() does not check the attrs->max_sge param. 1665 */ 1666 int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs, 1667 enum ib_srq_attr_mask attr_mask, struct ib_udata *udata) 1668 { 1669 struct siw_srq *srq = to_siw_srq(base_srq); 1670 unsigned long flags; 1671 int rv = 0; 1672 1673 spin_lock_irqsave(&srq->lock, flags); 1674 1675 if (attr_mask & IB_SRQ_MAX_WR) { 1676 /* resize request not yet supported */ 1677 rv = -EOPNOTSUPP; 1678 goto out; 1679 } 1680 if (attr_mask & IB_SRQ_LIMIT) { 1681 if (attrs->srq_limit) { 1682 if (unlikely(attrs->srq_limit > srq->num_rqe)) { 1683 rv = -EINVAL; 1684 goto out; 1685 } 1686 srq->armed = true; 1687 } else { 1688 srq->armed = false; 1689 } 1690 srq->limit = attrs->srq_limit; 1691 } 1692 out: 1693 spin_unlock_irqrestore(&srq->lock, flags); 1694 1695 return rv; 1696 } 1697 1698 /* 1699 * siw_query_srq() 1700 * 1701 * Query SRQ attributes. 1702 */ 1703 int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs) 1704 { 1705 struct siw_srq *srq = to_siw_srq(base_srq); 1706 unsigned long flags; 1707 1708 spin_lock_irqsave(&srq->lock, flags); 1709 1710 attrs->max_wr = srq->num_rqe; 1711 attrs->max_sge = srq->max_sge; 1712 attrs->srq_limit = srq->limit; 1713 1714 spin_unlock_irqrestore(&srq->lock, flags); 1715 1716 return 0; 1717 } 1718 1719 /* 1720 * siw_destroy_srq() 1721 * 1722 * Destroy SRQ. 
1723 * It is assumed that the SRQ is not referenced by any 1724 * QP anymore - the code trusts the RDMA core environment to keep track 1725 * of QP references. 1726 */ 1727 int siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata) 1728 { 1729 struct siw_srq *srq = to_siw_srq(base_srq); 1730 struct siw_device *sdev = to_siw_dev(base_srq->device); 1731 struct siw_ucontext *ctx = 1732 rdma_udata_to_drv_context(udata, struct siw_ucontext, 1733 base_ucontext); 1734 1735 if (ctx) 1736 rdma_user_mmap_entry_remove(srq->srq_entry); 1737 vfree(srq->recvq); 1738 atomic_dec(&sdev->num_srq); 1739 return 0; 1740 } 1741 1742 /* 1743 * siw_post_srq_recv() 1744 * 1745 * Post a list of receive queue elements to SRQ. 1746 * NOTE: The function does not check or lock a certain SRQ state 1747 * during the post operation. The code simply trusts the 1748 * RDMA core environment. 1749 * 1750 * @base_srq: Base SRQ contained in siw SRQ 1751 * @wr: List of R-WR's 1752 * @bad_wr: Updated to failing WR if posting fails. 1753 */ 1754 int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr, 1755 const struct ib_recv_wr **bad_wr) 1756 { 1757 struct siw_srq *srq = to_siw_srq(base_srq); 1758 unsigned long flags; 1759 int rv = 0; 1760 1761 if (unlikely(!srq->is_kernel_res)) { 1762 siw_dbg_pd(base_srq->pd, 1763 "[SRQ]: no kernel post_recv for mapped srq\n"); 1764 rv = -EINVAL; 1765 goto out; 1766 } 1767 /* 1768 * Serialize potentially multiple producers. 1769 * Also needed to serialize potentially multiple 1770 * consumers. 
1771 */ 1772 spin_lock_irqsave(&srq->lock, flags); 1773 1774 while (wr) { 1775 u32 idx = srq->rq_put % srq->num_rqe; 1776 struct siw_rqe *rqe = &srq->recvq[idx]; 1777 1778 if (rqe->flags) { 1779 siw_dbg_pd(base_srq->pd, "SRQ full\n"); 1780 rv = -ENOMEM; 1781 break; 1782 } 1783 if (unlikely(wr->num_sge > srq->max_sge)) { 1784 siw_dbg_pd(base_srq->pd, 1785 "[SRQ]: too many sge's: %d\n", wr->num_sge); 1786 rv = -EINVAL; 1787 break; 1788 } 1789 rqe->id = wr->wr_id; 1790 rqe->num_sge = wr->num_sge; 1791 siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge); 1792 1793 /* Make sure S-RQE is completely written before valid */ 1794 smp_wmb(); 1795 1796 rqe->flags = SIW_WQE_VALID; 1797 1798 srq->rq_put++; 1799 wr = wr->next; 1800 } 1801 spin_unlock_irqrestore(&srq->lock, flags); 1802 out: 1803 if (unlikely(rv < 0)) { 1804 siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv); 1805 *bad_wr = wr; 1806 } 1807 return rv; 1808 } 1809 1810 void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype) 1811 { 1812 struct ib_event event; 1813 struct ib_qp *base_qp = &qp->base_qp; 1814 1815 /* 1816 * Do not report asynchronous errors on QP which gets 1817 * destroyed via verbs interface (siw_destroy_qp()) 1818 */ 1819 if (qp->attrs.flags & SIW_QP_IN_DESTROY) 1820 return; 1821 1822 event.event = etype; 1823 event.device = base_qp->device; 1824 event.element.qp = base_qp; 1825 1826 if (base_qp->event_handler) { 1827 siw_dbg_qp(qp, "reporting event %d\n", etype); 1828 base_qp->event_handler(&event, base_qp->qp_context); 1829 } 1830 } 1831 1832 void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype) 1833 { 1834 struct ib_event event; 1835 struct ib_cq *base_cq = &cq->base_cq; 1836 1837 event.event = etype; 1838 event.device = base_cq->device; 1839 event.element.cq = base_cq; 1840 1841 if (base_cq->event_handler) { 1842 siw_dbg_cq(cq, "reporting CQ event %d\n", etype); 1843 base_cq->event_handler(&event, base_cq->cq_context); 1844 } 1845 } 1846 1847 void siw_srq_event(struct siw_srq *srq, 
enum ib_event_type etype) 1848 { 1849 struct ib_event event; 1850 struct ib_srq *base_srq = &srq->base_srq; 1851 1852 event.event = etype; 1853 event.device = base_srq->device; 1854 event.element.srq = base_srq; 1855 1856 if (base_srq->event_handler) { 1857 siw_dbg_pd(srq->base_srq.pd, 1858 "reporting SRQ event %d\n", etype); 1859 base_srq->event_handler(&event, base_srq->srq_context); 1860 } 1861 } 1862 1863 void siw_port_event(struct siw_device *sdev, u32 port, enum ib_event_type etype) 1864 { 1865 struct ib_event event; 1866 1867 event.event = etype; 1868 event.device = &sdev->base_dev; 1869 event.element.port_num = port; 1870 1871 siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype); 1872 1873 ib_dispatch_event(&event); 1874 } 1875