// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause

/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
/* Copyright (c) 2008-2019, IBM Corporation */

#include <linux/errno.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/xarray.h>

#include <rdma/iw_cm.h>
#include <rdma/ib_verbs.h>
#include <rdma/ib_user_verbs.h>
#include <rdma/uverbs_ioctl.h>

#include "siw.h"
#include "siw_verbs.h"
#include "siw_mem.h"

static int ib_qp_state_to_siw_qp_state[IB_QPS_ERR + 1] = {
	[IB_QPS_RESET] = SIW_QP_STATE_IDLE,
	[IB_QPS_INIT] = SIW_QP_STATE_IDLE,
	[IB_QPS_RTR] = SIW_QP_STATE_RTR,
	[IB_QPS_RTS] = SIW_QP_STATE_RTS,
	[IB_QPS_SQD] = SIW_QP_STATE_CLOSING,
	[IB_QPS_SQE] = SIW_QP_STATE_TERMINATE,
	[IB_QPS_ERR] = SIW_QP_STATE_ERROR
};

static char ib_qp_state_to_string[IB_QPS_ERR + 1][sizeof("RESET")] = {
	[IB_QPS_RESET] = "RESET", [IB_QPS_INIT] = "INIT", [IB_QPS_RTR] = "RTR",
	[IB_QPS_RTS] = "RTS", [IB_QPS_SQD] = "SQD", [IB_QPS_SQE] = "SQE",
	[IB_QPS_ERR] = "ERR"
};

static u32 siw_create_uobj(struct siw_ucontext *uctx, void *vaddr, u32 size)
{
	struct siw_uobj *uobj;
	struct xa_limit limit = XA_LIMIT(0, SIW_UOBJ_MAX_KEY);
	u32 key;

	uobj = kzalloc(sizeof(*uobj), GFP_KERNEL);
	if (!uobj)
		return SIW_INVAL_UOBJ_KEY;

	if (xa_alloc_cyclic(&uctx->xa, &key, uobj, limit, &uctx->uobj_nextkey,
			    GFP_KERNEL) < 0) {
		kfree(uobj);
		return SIW_INVAL_UOBJ_KEY;
	}
	uobj->size = PAGE_ALIGN(size);
	uobj->addr = vaddr;

	return key;
}

static struct siw_uobj *siw_get_uobj(struct siw_ucontext *uctx,
				     unsigned long off, u32 size)
{
	struct siw_uobj *uobj = xa_load(&uctx->xa, off);

	if (uobj && uobj->size == size)
		return uobj;

	return NULL;
}

int siw_mmap(struct ib_ucontext *ctx, struct vm_area_struct *vma)
{
	struct siw_ucontext *uctx = to_siw_ctx(ctx);
	struct siw_uobj *uobj;
	unsigned long off = vma->vm_pgoff;
	int size = vma->vm_end - vma->vm_start;
	int rv = -EINVAL;

	/*
	 * Must be page aligned
	 */
	if (vma->vm_start & (PAGE_SIZE - 1)) {
		pr_warn("siw: mmap not page aligned\n");
		goto out;
	}
	uobj = siw_get_uobj(uctx, off, size);
	if (!uobj) {
		siw_dbg(&uctx->sdev->base_dev, "mmap lookup failed: %lu, %u\n",
			off, size);
		goto out;
	}
	rv = remap_vmalloc_range(vma, uobj->addr, 0);
	if (rv)
		pr_warn("remap_vmalloc_range failed: %lu, %u\n", off, size);
out:
	return rv;
}

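/*
 * Note on the user object (uobj) mmap scheme (descriptive comment, not
 * authoritative): queue memory allocated with vmalloc_user() is registered
 * under a small xarray key via siw_create_uobj(). The key, scaled by
 * PAGE_SHIFT where applicable, is reported back to user space in the
 * create_qp/create_cq/create_srq responses and is later presented as the
 * mmap() offset, which siw_mmap() resolves via siw_get_uobj() before
 * calling remap_vmalloc_range().
 *
 * A hypothetical user-space sketch (names illustrative only):
 *
 *	sq = mmap(NULL, num_sqe * sizeof(struct siw_sqe),
 *		  PROT_READ | PROT_WRITE, MAP_SHARED,
 *		  ctx_fd, uresp.sq_key);
 */
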
int siw_alloc_ucontext(struct ib_ucontext *base_ctx, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_ctx->device);
	struct siw_ucontext *ctx = to_siw_ctx(base_ctx);
	struct siw_uresp_alloc_ctx uresp = {};
	int rv;

	if (atomic_inc_return(&sdev->num_ctx) > SIW_MAX_CONTEXT) {
		rv = -ENOMEM;
		goto err_out;
	}
	xa_init_flags(&ctx->xa, XA_FLAGS_ALLOC);
	ctx->uobj_nextkey = 0;
	ctx->sdev = sdev;

	uresp.dev_id = sdev->vendor_part_id;

	if (udata->outlen < sizeof(uresp)) {
		rv = -EINVAL;
		goto err_out;
	}
	rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
	if (rv)
		goto err_out;

	siw_dbg(base_ctx->device, "success. now %d context(s)\n",
		atomic_read(&sdev->num_ctx));

	return 0;

err_out:
	atomic_dec(&sdev->num_ctx);
	siw_dbg(base_ctx->device, "failure %d. now %d context(s)\n", rv,
		atomic_read(&sdev->num_ctx));

	return rv;
}

void siw_dealloc_ucontext(struct ib_ucontext *base_ctx)
{
	struct siw_ucontext *uctx = to_siw_ctx(base_ctx);
	void *entry;
	unsigned long index;

	/*
	 * Make sure all user mmap objects are gone. Since QP, CQ
	 * and SRQ destroy routines destroy related objects, nothing
	 * should be found here.
	 */
	xa_for_each(&uctx->xa, index, entry) {
		kfree(xa_erase(&uctx->xa, index));
		pr_warn("siw: dropping orphaned uobj at %lu\n", index);
	}
	xa_destroy(&uctx->xa);
	atomic_dec(&uctx->sdev->num_ctx);
}

int siw_query_device(struct ib_device *base_dev, struct ib_device_attr *attr,
		     struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	if (udata->inlen || udata->outlen)
		return -EINVAL;

	memset(attr, 0, sizeof(*attr));

	/* Revisit atomic caps if RFC 7306 gets supported */
	attr->atomic_cap = 0;
	attr->device_cap_flags =
		IB_DEVICE_MEM_MGT_EXTENSIONS | IB_DEVICE_ALLOW_USER_UNREG;
	attr->max_cq = sdev->attrs.max_cq;
	attr->max_cqe = sdev->attrs.max_cqe;
	attr->max_fast_reg_page_list_len = SIW_MAX_SGE_PBL;
	attr->max_fmr = sdev->attrs.max_fmr;
	attr->max_mr = sdev->attrs.max_mr;
	attr->max_mw = sdev->attrs.max_mw;
	attr->max_mr_size = ~0ull;
	attr->max_pd = sdev->attrs.max_pd;
	attr->max_qp = sdev->attrs.max_qp;
	attr->max_qp_init_rd_atom = sdev->attrs.max_ird;
	attr->max_qp_rd_atom = sdev->attrs.max_ord;
	attr->max_qp_wr = sdev->attrs.max_qp_wr;
	attr->max_recv_sge = sdev->attrs.max_sge;
	attr->max_res_rd_atom = sdev->attrs.max_qp * sdev->attrs.max_ird;
	attr->max_send_sge = sdev->attrs.max_sge;
	attr->max_sge_rd = sdev->attrs.max_sge_rd;
	attr->max_srq = sdev->attrs.max_srq;
	attr->max_srq_sge = sdev->attrs.max_srq_sge;
	attr->max_srq_wr = sdev->attrs.max_srq_wr;
	attr->page_size_cap = PAGE_SIZE;
	attr->vendor_id = SIW_VENDOR_ID;
	attr->vendor_part_id = sdev->vendor_part_id;

	memcpy(&attr->sys_image_guid, sdev->netdev->dev_addr, 6);

	return 0;
}

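/*
 * siw_query_port()
 *
 * Report link attributes of the attached net_device: the MTU is taken
 * from the netdev, and the physical/logical port state follows the
 * software device state. Speed and width are reported as fixed values.
 */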
int siw_query_port(struct ib_device *base_dev, u8 port,
		   struct ib_port_attr *attr)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	memset(attr, 0, sizeof(*attr));

	attr->max_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
	attr->active_mtu = attr->max_mtu;
	attr->active_speed = 2;
	attr->active_width = 2;
	attr->gid_tbl_len = 1;
	attr->max_msg_sz = -1;
	attr->phys_state = sdev->state == IB_PORT_ACTIVE ?
		IB_PORT_PHYS_STATE_LINK_UP : IB_PORT_PHYS_STATE_DISABLED;
	attr->pkey_tbl_len = 1;
	attr->port_cap_flags = IB_PORT_CM_SUP | IB_PORT_DEVICE_MGMT_SUP;
	attr->state = sdev->state;
	/*
	 * All zero
	 *
	 * attr->lid = 0;
	 * attr->bad_pkey_cntr = 0;
	 * attr->qkey_viol_cntr = 0;
	 * attr->sm_lid = 0;
	 * attr->lmc = 0;
	 * attr->max_vl_num = 0;
	 * attr->sm_sl = 0;
	 * attr->subnet_timeout = 0;
	 * attr->init_type_reply = 0;
	 */
	return 0;
}

int siw_get_port_immutable(struct ib_device *base_dev, u8 port,
			   struct ib_port_immutable *port_immutable)
{
	struct ib_port_attr attr;
	int rv = siw_query_port(base_dev, port, &attr);

	if (rv)
		return rv;

	port_immutable->pkey_tbl_len = attr.pkey_tbl_len;
	port_immutable->gid_tbl_len = attr.gid_tbl_len;
	port_immutable->core_cap_flags = RDMA_CORE_PORT_IWARP;

	return 0;
}

int siw_query_pkey(struct ib_device *base_dev, u8 port, u16 idx, u16 *pkey)
{
	/* Report the default pkey */
	*pkey = 0xffff;
	return 0;
}

int siw_query_gid(struct ib_device *base_dev, u8 port, int idx,
		  union ib_gid *gid)
{
	struct siw_device *sdev = to_siw_dev(base_dev);

	/* subnet_prefix == interface_id == 0; */
	memset(gid, 0, sizeof(*gid));
	memcpy(&gid->raw[0], sdev->netdev->dev_addr, 6);

	return 0;
}

int siw_alloc_pd(struct ib_pd *pd, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(pd->device);

	if (atomic_inc_return(&sdev->num_pd) > SIW_MAX_PD) {
		atomic_dec(&sdev->num_pd);
		return -ENOMEM;
	}
	siw_dbg_pd(pd, "now %d PD(s)\n", atomic_read(&sdev->num_pd));

	return 0;
}

void siw_dealloc_pd(struct ib_pd *pd, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(pd->device);

	siw_dbg_pd(pd, "free PD\n");
	atomic_dec(&sdev->num_pd);
}

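/*
 * siw_qp_get_ref() / siw_qp_put_ref()
 *
 * Reference counting hooks, presumably registered with the iWARP
 * connection manager so a QP is pinned while a connection endpoint
 * still refers to it (descriptive note; the hook registration lives
 * outside this file).
 */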
void siw_qp_get_ref(struct ib_qp *base_qp)
{
	siw_qp_get(to_siw_qp(base_qp));
}

void siw_qp_put_ref(struct ib_qp *base_qp)
{
	siw_qp_put(to_siw_qp(base_qp));
}

/*
 * siw_create_qp()
 *
 * Create QP of requested size on given device.
 *
 * @pd: Protection Domain
 * @attrs: Initial QP attributes.
 * @udata: used to provide QP ID, SQ and RQ size back to user.
 */

struct ib_qp *siw_create_qp(struct ib_pd *pd,
			    struct ib_qp_init_attr *attrs,
			    struct ib_udata *udata)
{
	struct siw_qp *qp = NULL;
	struct siw_base_qp *siw_base_qp = NULL;
	struct ib_device *base_dev = pd->device;
	struct siw_device *sdev = to_siw_dev(base_dev);
	struct siw_ucontext *uctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);
	struct siw_cq *scq = NULL, *rcq = NULL;
	unsigned long flags;
	int num_sqe, num_rqe, rv = 0;

	siw_dbg(base_dev, "create new QP\n");

	if (atomic_inc_return(&sdev->num_qp) > SIW_MAX_QP) {
		siw_dbg(base_dev, "too many QP's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (attrs->qp_type != IB_QPT_RC) {
		siw_dbg(base_dev, "only RC QP's supported\n");
		rv = -EINVAL;
		goto err_out;
	}
	if ((attrs->cap.max_send_wr > SIW_MAX_QP_WR) ||
	    (attrs->cap.max_recv_wr > SIW_MAX_QP_WR) ||
	    (attrs->cap.max_send_sge > SIW_MAX_SGE) ||
	    (attrs->cap.max_recv_sge > SIW_MAX_SGE)) {
		siw_dbg(base_dev, "QP size error\n");
		rv = -EINVAL;
		goto err_out;
	}
	if (attrs->cap.max_inline_data > SIW_MAX_INLINE) {
		siw_dbg(base_dev, "max inline send: %d > %d\n",
			attrs->cap.max_inline_data, (int)SIW_MAX_INLINE);
		rv = -EINVAL;
		goto err_out;
	}
	/*
	 * NOTE: we allow for zero element SQ and RQ WQE's SGL's
	 * but not for a QP unable to hold any WQE (SQ + RQ)
	 */
	if (attrs->cap.max_send_wr + attrs->cap.max_recv_wr == 0) {
		siw_dbg(base_dev, "QP must have send or receive queue\n");
		rv = -EINVAL;
		goto err_out;
	}
	scq = to_siw_cq(attrs->send_cq);
	rcq = to_siw_cq(attrs->recv_cq);

	if (!scq || (!rcq && !attrs->srq)) {
		siw_dbg(base_dev, "send CQ or receive CQ invalid\n");
		rv = -EINVAL;
		goto err_out;
	}
	siw_base_qp = kzalloc(sizeof(*siw_base_qp), GFP_KERNEL);
	if (!siw_base_qp) {
		rv = -ENOMEM;
		goto err_out;
	}
	qp = kzalloc(sizeof(*qp), GFP_KERNEL);
	if (!qp) {
		rv = -ENOMEM;
		goto err_out;
	}
	siw_base_qp->qp = qp;
	qp->ib_qp = &siw_base_qp->base_qp;

	init_rwsem(&qp->state_lock);
	spin_lock_init(&qp->sq_lock);
	spin_lock_init(&qp->rq_lock);
	spin_lock_init(&qp->orq_lock);

	qp->kernel_verbs = !udata;
	qp->xa_sq_index = SIW_INVAL_UOBJ_KEY;
	qp->xa_rq_index = SIW_INVAL_UOBJ_KEY;

	rv = siw_qp_add(sdev, qp);
	if (rv)
		goto err_out;

	/* All queue indices are derived from modulo operations
	 * on a free running 'get' (consumer) and 'put' (producer)
	 * unsigned counter. Having queue sizes at power of two
	 * avoids handling counter wrap around.
	 */
	num_sqe = roundup_pow_of_two(attrs->cap.max_send_wr);
	num_rqe = roundup_pow_of_two(attrs->cap.max_recv_wr);

	if (qp->kernel_verbs)
		qp->sendq = vzalloc(num_sqe * sizeof(struct siw_sqe));
	else
		qp->sendq = vmalloc_user(num_sqe * sizeof(struct siw_sqe));

	if (qp->sendq == NULL) {
		siw_dbg(base_dev, "SQ size %d alloc failed\n", num_sqe);
		rv = -ENOMEM;
		goto err_out_xa;
	}
	if (attrs->sq_sig_type != IB_SIGNAL_REQ_WR) {
		if (attrs->sq_sig_type == IB_SIGNAL_ALL_WR)
			qp->attrs.flags |= SIW_SIGNAL_ALL_WR;
		else {
			rv = -EINVAL;
			goto err_out_xa;
		}
	}
	qp->pd = pd;
	qp->scq = scq;
	qp->rcq = rcq;

	if (attrs->srq) {
		/*
		 * SRQ support.
		 * Verbs 6.3.7: ignore RQ size, if SRQ present
		 * Verbs 6.3.5: do not check PD of SRQ against PD of QP
		 */
		qp->srq = to_siw_srq(attrs->srq);
		qp->attrs.rq_size = 0;
		siw_dbg(base_dev, "QP [%u]: SRQ attached\n", qp->qp_num);
	} else if (num_rqe) {
		if (qp->kernel_verbs)
			qp->recvq = vzalloc(num_rqe * sizeof(struct siw_rqe));
		else
			qp->recvq =
				vmalloc_user(num_rqe * sizeof(struct siw_rqe));

		if (qp->recvq == NULL) {
			siw_dbg(base_dev, "RQ size %d alloc failed\n", num_rqe);
			rv = -ENOMEM;
			goto err_out_xa;
		}
		qp->attrs.rq_size = num_rqe;
	}
	qp->attrs.sq_size = num_sqe;
	qp->attrs.sq_max_sges = attrs->cap.max_send_sge;
	qp->attrs.rq_max_sges = attrs->cap.max_recv_sge;

	/* Make those two tunables fixed for now. */
	qp->tx_ctx.gso_seg_limit = 1;
	qp->tx_ctx.zcopy_tx = zcopy_tx;

	qp->attrs.state = SIW_QP_STATE_IDLE;

	if (udata) {
		struct siw_uresp_create_qp uresp = {};

		uresp.num_sqe = num_sqe;
		uresp.num_rqe = num_rqe;
		uresp.qp_id = qp_id(qp);

		if (qp->sendq) {
			qp->xa_sq_index =
				siw_create_uobj(uctx, qp->sendq,
					num_sqe * sizeof(struct siw_sqe));
		}
		if (qp->recvq) {
			qp->xa_rq_index =
				siw_create_uobj(uctx, qp->recvq,
					num_rqe * sizeof(struct siw_rqe));
		}
		if (qp->xa_sq_index == SIW_INVAL_UOBJ_KEY ||
		    qp->xa_rq_index == SIW_INVAL_UOBJ_KEY) {
			rv = -ENOMEM;
			goto err_out_xa;
		}
		uresp.sq_key = qp->xa_sq_index << PAGE_SHIFT;
		uresp.rq_key = qp->xa_rq_index << PAGE_SHIFT;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out_xa;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out_xa;
	}
	qp->tx_cpu = siw_get_tx_cpu(sdev);
	if (qp->tx_cpu < 0) {
		rv = -EINVAL;
		goto err_out_xa;
	}
	INIT_LIST_HEAD(&qp->devq);
	spin_lock_irqsave(&sdev->lock, flags);
	list_add_tail(&qp->devq, &sdev->qp_list);
	spin_unlock_irqrestore(&sdev->lock, flags);

	return qp->ib_qp;

err_out_xa:
	xa_erase(&sdev->qp_xa, qp_id(qp));
err_out:
	kfree(siw_base_qp);

	if (qp) {
		if (qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
		if (qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&uctx->xa, qp->xa_rq_index));

		vfree(qp->sendq);
		vfree(qp->recvq);
		kfree(qp);
	}
	atomic_dec(&sdev->num_qp);

	return ERR_PTR(rv);
}

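/*
 * Descriptive note on the work queue ring convention used above and in
 * the post_send()/post_receive() paths below: 'put' (producer) and 'get'
 * (consumer) are free-running counters, and a slot is addressed as
 * counter % queue_size. Since queue sizes are rounded up to a power of
 * two, the modulo reduces to a mask and counter wrap-around needs no
 * special handling. For example, with sq_size = 8, sq_put = UINT_MAX
 * selects slot 7 and the next increment wraps around to slot 0. Slot
 * ownership is handed over via the SIW_WQE_VALID flag.
 */
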
/*
 * Minimum siw_query_qp() verb interface.
 *
 * @qp_attr_mask is not used but all available information is provided
 */
int siw_query_qp(struct ib_qp *base_qp, struct ib_qp_attr *qp_attr,
		 int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr)
{
	struct siw_qp *qp;
	struct siw_device *sdev;

	if (base_qp && qp_attr && qp_init_attr) {
		qp = to_siw_qp(base_qp);
		sdev = to_siw_dev(base_qp->device);
	} else {
		return -EINVAL;
	}
	qp_attr->cap.max_inline_data = SIW_MAX_INLINE;
	qp_attr->cap.max_send_wr = qp->attrs.sq_size;
	qp_attr->cap.max_send_sge = qp->attrs.sq_max_sges;
	qp_attr->cap.max_recv_wr = qp->attrs.rq_size;
	qp_attr->cap.max_recv_sge = qp->attrs.rq_max_sges;
	qp_attr->path_mtu = ib_mtu_int_to_enum(sdev->netdev->mtu);
	qp_attr->max_rd_atomic = qp->attrs.irq_size;
	qp_attr->max_dest_rd_atomic = qp->attrs.orq_size;

	qp_attr->qp_access_flags = IB_ACCESS_LOCAL_WRITE |
				   IB_ACCESS_REMOTE_WRITE |
				   IB_ACCESS_REMOTE_READ;

	qp_init_attr->qp_type = base_qp->qp_type;
	qp_init_attr->send_cq = base_qp->send_cq;
	qp_init_attr->recv_cq = base_qp->recv_cq;
	qp_init_attr->srq = base_qp->srq;

	qp_init_attr->cap = qp_attr->cap;

	return 0;
}

int siw_verbs_modify_qp(struct ib_qp *base_qp, struct ib_qp_attr *attr,
			int attr_mask, struct ib_udata *udata)
{
	struct siw_qp_attrs new_attrs;
	enum siw_qp_attr_mask siw_attr_mask = 0;
	struct siw_qp *qp = to_siw_qp(base_qp);
	int rv = 0;

	if (!attr_mask)
		return 0;

	memset(&new_attrs, 0, sizeof(new_attrs));

	if (attr_mask & IB_QP_ACCESS_FLAGS) {
		siw_attr_mask = SIW_QP_ATTR_ACCESS_FLAGS;

		if (attr->qp_access_flags & IB_ACCESS_REMOTE_READ)
			new_attrs.flags |= SIW_RDMA_READ_ENABLED;
		if (attr->qp_access_flags & IB_ACCESS_REMOTE_WRITE)
			new_attrs.flags |= SIW_RDMA_WRITE_ENABLED;
		if (attr->qp_access_flags & IB_ACCESS_MW_BIND)
			new_attrs.flags |= SIW_RDMA_BIND_ENABLED;
	}
	if (attr_mask & IB_QP_STATE) {
		siw_dbg_qp(qp, "desired IB QP state: %s\n",
			   ib_qp_state_to_string[attr->qp_state]);

		new_attrs.state = ib_qp_state_to_siw_qp_state[attr->qp_state];

		if (new_attrs.state > SIW_QP_STATE_RTS)
			qp->tx_ctx.tx_suspend = 1;

		siw_attr_mask |= SIW_QP_ATTR_STATE;
	}
	if (!siw_attr_mask)
		goto out;

	down_write(&qp->state_lock);

	rv = siw_qp_modify(qp, &new_attrs, siw_attr_mask);

	up_write(&qp->state_lock);
out:
	return rv;
}

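/*
 * siw_destroy_qp()
 *
 * Tear down a QP: mark it as being destroyed and suspend RX processing,
 * drop the user mappings of SQ and RQ, force the QP into ERROR state
 * under the state lock, release an attached connection endpoint, and
 * finally drop the QP reference.
 */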
int siw_destroy_qp(struct ib_qp *base_qp, struct ib_udata *udata)
{
	struct siw_qp *qp = to_siw_qp(base_qp);
	struct siw_ucontext *uctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);
	struct siw_qp_attrs qp_attrs;

	siw_dbg_qp(qp, "state %d\n", qp->attrs.state);

	/*
	 * Mark QP as in process of destruction to prevent any async
	 * callbacks to the RDMA core
	 */
	qp->attrs.flags |= SIW_QP_IN_DESTROY;
	qp->rx_stream.rx_suspend = 1;

	if (uctx && qp->xa_sq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&uctx->xa, qp->xa_sq_index));
	if (uctx && qp->xa_rq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&uctx->xa, qp->xa_rq_index));

	down_write(&qp->state_lock);

	qp_attrs.state = SIW_QP_STATE_ERROR;
	siw_qp_modify(qp, &qp_attrs, SIW_QP_ATTR_STATE);

	if (qp->cep) {
		siw_cep_put(qp->cep);
		qp->cep = NULL;
	}
	up_write(&qp->state_lock);

	kfree(qp->tx_ctx.mpa_crc_hd);
	kfree(qp->rx_stream.mpa_crc_hd);

	qp->scq = qp->rcq = NULL;

	siw_qp_put(qp);

	return 0;
}

/*
 * siw_copy_inline_sgl()
 *
 * Prepare an SGL of inlined data for sending. For userland callers, the
 * function checks whether the given buffer addresses and lengths are
 * within process context bounds.
 * Data from all provided SGEs are copied together into the WQE,
 * referenced by a single SGE.
 */
static int siw_copy_inline_sgl(const struct ib_send_wr *core_wr,
			       struct siw_sqe *sqe)
{
	struct ib_sge *core_sge = core_wr->sg_list;
	void *kbuf = &sqe->sge[1];
	int num_sge = core_wr->num_sge, bytes = 0;

	sqe->sge[0].laddr = (uintptr_t)kbuf;
	sqe->sge[0].lkey = 0;

	while (num_sge--) {
		if (!core_sge->length) {
			core_sge++;
			continue;
		}
		bytes += core_sge->length;
		if (bytes > SIW_MAX_INLINE) {
			bytes = -EINVAL;
			break;
		}
		memcpy(kbuf, (void *)(uintptr_t)core_sge->addr,
		       core_sge->length);

		kbuf += core_sge->length;
		core_sge++;
	}
	sqe->sge[0].length = bytes > 0 ? bytes : 0;
	sqe->num_sge = bytes > 0 ? 1 : 0;

	return bytes;
}

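/*
 * A minimal kernel-client sketch of an inline send, assuming the payload
 * fits into SIW_MAX_INLINE (illustrative only; 'qp', 'buf' and 'len' are
 * hypothetical):
 *
 *	const struct ib_send_wr *bad_wr;
 *	struct ib_sge sge = { .addr = (uintptr_t)buf, .length = len };
 *	struct ib_send_wr wr = {
 *		.opcode		= IB_WR_SEND,
 *		.send_flags	= IB_SEND_INLINE | IB_SEND_SIGNALED,
 *		.sg_list	= &sge,
 *		.num_sge	= 1,
 *	};
 *	int ret = ib_post_send(qp, &wr, &bad_wr);
 *
 * The data is copied into the SQE by siw_copy_inline_sgl(), so 'buf' may
 * be reused as soon as ib_post_send() returns.
 */
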
/*
 * siw_post_send()
 *
 * Post a list of S-WR's to a SQ.
 *
 * @base_qp: Base QP contained in siw QP
 * @wr: Null terminated list of user WR's
 * @bad_wr: Points to failing WR in case of synchronous failure.
 */
int siw_post_send(struct ib_qp *base_qp, const struct ib_send_wr *wr,
		  const struct ib_send_wr **bad_wr)
{
	struct siw_qp *qp = to_siw_qp(base_qp);
	struct siw_wqe *wqe = tx_wqe(qp);

	unsigned long flags;
	int rv = 0;

	/*
	 * Try to acquire QP state lock. Must be non-blocking
	 * to accommodate kernel clients' needs.
	 */
	if (!down_read_trylock(&qp->state_lock)) {
		*bad_wr = wr;
		siw_dbg_qp(qp, "QP locked, state %d\n", qp->attrs.state);
		return -ENOTCONN;
	}
	if (unlikely(qp->attrs.state != SIW_QP_STATE_RTS)) {
		up_read(&qp->state_lock);
		*bad_wr = wr;
		siw_dbg_qp(qp, "QP out of state %d\n", qp->attrs.state);
		return -ENOTCONN;
	}
	if (wr && !qp->kernel_verbs) {
		siw_dbg_qp(qp, "wr must be empty for user mapped sq\n");
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -EINVAL;
	}
	spin_lock_irqsave(&qp->sq_lock, flags);

	while (wr) {
		u32 idx = qp->sq_put % qp->attrs.sq_size;
		struct siw_sqe *sqe = &qp->sendq[idx];

		if (sqe->flags) {
			siw_dbg_qp(qp, "sq full\n");
			rv = -ENOMEM;
			break;
		}
		if (wr->num_sge > qp->attrs.sq_max_sges) {
			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
			rv = -EINVAL;
			break;
		}
		sqe->id = wr->wr_id;

		if ((wr->send_flags & IB_SEND_SIGNALED) ||
		    (qp->attrs.flags & SIW_SIGNAL_ALL_WR))
			sqe->flags |= SIW_WQE_SIGNALLED;

		if (wr->send_flags & IB_SEND_FENCE)
			sqe->flags |= SIW_WQE_READ_FENCE;

		switch (wr->opcode) {
		case IB_WR_SEND:
		case IB_WR_SEND_WITH_INV:
			if (wr->send_flags & IB_SEND_SOLICITED)
				sqe->flags |= SIW_WQE_SOLICITED;

			if (!(wr->send_flags & IB_SEND_INLINE)) {
				siw_copy_sgl(wr->sg_list, sqe->sge,
					     wr->num_sge);
				sqe->num_sge = wr->num_sge;
			} else {
				rv = siw_copy_inline_sgl(wr, sqe);
				if (rv <= 0) {
					rv = -EINVAL;
					break;
				}
				sqe->flags |= SIW_WQE_INLINE;
				sqe->num_sge = 1;
			}
			if (wr->opcode == IB_WR_SEND)
				sqe->opcode = SIW_OP_SEND;
			else {
				sqe->opcode = SIW_OP_SEND_REMOTE_INV;
				sqe->rkey = wr->ex.invalidate_rkey;
			}
			break;

		case IB_WR_RDMA_READ_WITH_INV:
		case IB_WR_RDMA_READ:
			/*
			 * iWarp restricts RREAD sink to SGL containing
			 * one SGE only. We could relax to SGL with multiple
			 * elements referring the SAME ltag or even sending
			 * a private per-rreq tag referring to a checked
			 * local sgl with MULTIPLE ltag's.
			 */
			if (unlikely(wr->num_sge != 1)) {
				rv = -EINVAL;
				break;
			}
			siw_copy_sgl(wr->sg_list, &sqe->sge[0], 1);
			/*
			 * NOTE: zero length RREAD is allowed!
			 */
			sqe->raddr = rdma_wr(wr)->remote_addr;
			sqe->rkey = rdma_wr(wr)->rkey;
			sqe->num_sge = 1;

			if (wr->opcode == IB_WR_RDMA_READ)
				sqe->opcode = SIW_OP_READ;
			else
				sqe->opcode = SIW_OP_READ_LOCAL_INV;
			break;

		case IB_WR_RDMA_WRITE:
			if (!(wr->send_flags & IB_SEND_INLINE)) {
				siw_copy_sgl(wr->sg_list, &sqe->sge[0],
					     wr->num_sge);
				sqe->num_sge = wr->num_sge;
			} else {
				rv = siw_copy_inline_sgl(wr, sqe);
				if (unlikely(rv < 0)) {
					rv = -EINVAL;
					break;
				}
				sqe->flags |= SIW_WQE_INLINE;
				sqe->num_sge = 1;
			}
			sqe->raddr = rdma_wr(wr)->remote_addr;
			sqe->rkey = rdma_wr(wr)->rkey;
			sqe->opcode = SIW_OP_WRITE;
			break;

		case IB_WR_REG_MR:
			sqe->base_mr = (uintptr_t)reg_wr(wr)->mr;
			sqe->rkey = reg_wr(wr)->key;
			sqe->access = reg_wr(wr)->access & IWARP_ACCESS_MASK;
			sqe->opcode = SIW_OP_REG_MR;
			break;

		case IB_WR_LOCAL_INV:
			sqe->rkey = wr->ex.invalidate_rkey;
			sqe->opcode = SIW_OP_INVAL_STAG;
			break;

		default:
			siw_dbg_qp(qp, "ib wr type %d unsupported\n",
				   wr->opcode);
			rv = -EINVAL;
			break;
		}
		siw_dbg_qp(qp, "opcode %d, flags 0x%x, wr_id 0x%pK\n",
			   sqe->opcode, sqe->flags,
			   (void *)(uintptr_t)sqe->id);

		if (unlikely(rv < 0))
			break;

		/* make SQE only valid after completely written */
		smp_wmb();
		sqe->flags |= SIW_WQE_VALID;

		qp->sq_put++;
		wr = wr->next;
	}

	/*
	 * Send directly if SQ processing is not in progress.
	 * Eventual immediate errors (rv < 0) do not affect the involved
	 * RI resources (Verbs, 8.3.1) and thus do not prevent SQ
	 * processing, if new work is already pending. But rv must be passed
	 * to caller.
	 */
	if (wqe->wr_status != SIW_WR_IDLE) {
		spin_unlock_irqrestore(&qp->sq_lock, flags);
		goto skip_direct_sending;
	}
	rv = siw_activate_tx(qp);
	spin_unlock_irqrestore(&qp->sq_lock, flags);

	if (rv <= 0)
		goto skip_direct_sending;

	if (qp->kernel_verbs) {
		rv = siw_sq_start(qp);
	} else {
		qp->tx_ctx.in_syscall = 1;

		if (siw_qp_sq_process(qp) != 0 && !(qp->tx_ctx.tx_suspend))
			siw_qp_cm_drop(qp, 0);

		qp->tx_ctx.in_syscall = 0;
	}
skip_direct_sending:

	up_read(&qp->state_lock);

	if (rv >= 0)
		return 0;
	/*
	 * Immediate error
	 */
	siw_dbg_qp(qp, "error %d\n", rv);

	*bad_wr = wr;
	return rv;
}

/*
 * siw_post_receive()
 *
 * Post a list of R-WR's to a RQ.
 *
 * @base_qp: Base QP contained in siw QP
 * @wr: Null terminated list of user WR's
 * @bad_wr: Points to failing WR in case of synchronous failure.
 */
int siw_post_receive(struct ib_qp *base_qp, const struct ib_recv_wr *wr,
		     const struct ib_recv_wr **bad_wr)
{
	struct siw_qp *qp = to_siw_qp(base_qp);
	unsigned long flags;
	int rv = 0;

	if (qp->srq) {
		*bad_wr = wr;
		return -EOPNOTSUPP; /* what else from errno.h? */
	}
	/*
	 * Try to acquire QP state lock. Must be non-blocking
	 * to accommodate kernel clients' needs.
	 */
	if (!down_read_trylock(&qp->state_lock)) {
		*bad_wr = wr;
		return -ENOTCONN;
	}
	if (!qp->kernel_verbs) {
		siw_dbg_qp(qp, "no kernel post_recv for user mapped rq\n");
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -EINVAL;
	}
	if (qp->attrs.state > SIW_QP_STATE_RTS) {
		up_read(&qp->state_lock);
		*bad_wr = wr;
		return -EINVAL;
	}
	/*
	 * Serialize potentially multiple producers.
	 * Not needed for single threaded consumer side.
	 */
	spin_lock_irqsave(&qp->rq_lock, flags);

	while (wr) {
		u32 idx = qp->rq_put % qp->attrs.rq_size;
		struct siw_rqe *rqe = &qp->recvq[idx];

		if (rqe->flags) {
			siw_dbg_qp(qp, "RQ full\n");
			rv = -ENOMEM;
			break;
		}
		if (wr->num_sge > qp->attrs.rq_max_sges) {
			siw_dbg_qp(qp, "too many sge's: %d\n", wr->num_sge);
			rv = -EINVAL;
			break;
		}
		rqe->id = wr->wr_id;
		rqe->num_sge = wr->num_sge;
		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);

		/* make sure RQE is completely written before valid */
		smp_wmb();

		rqe->flags = SIW_WQE_VALID;

		qp->rq_put++;
		wr = wr->next;
	}
	spin_unlock_irqrestore(&qp->rq_lock, flags);

	up_read(&qp->state_lock);

	if (rv < 0) {
		siw_dbg_qp(qp, "error %d\n", rv);
		*bad_wr = wr;
	}
	return rv > 0 ? 0 : rv;
}

void siw_destroy_cq(struct ib_cq *base_cq, struct ib_udata *udata)
{
	struct siw_cq *cq = to_siw_cq(base_cq);
	struct siw_device *sdev = to_siw_dev(base_cq->device);
	struct siw_ucontext *ctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);

	siw_dbg_cq(cq, "free CQ resources\n");

	siw_cq_flush(cq);

	if (ctx && cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&ctx->xa, cq->xa_cq_index));

	atomic_dec(&sdev->num_cq);

	vfree(cq->queue);
}

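/*
 * Descriptive note on the CQ layout set up below: the queue allocation
 * covers num_cqe CQEs plus one trailing struct siw_cq_ctrl, and
 * cq->notify is pointed at that trailing element. For user CQs the whole
 * region, including the notify control word written by
 * siw_req_notify_cq(), is part of the mappable user object.
 */
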
/*
 * siw_create_cq()
 *
 * Populate CQ of requested size
 *
 * @base_cq: CQ as allocated by RDMA midlayer
 * @attr: Initial CQ attributes
 * @udata: relates to user context
 */

int siw_create_cq(struct ib_cq *base_cq, const struct ib_cq_init_attr *attr,
		  struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(base_cq->device);
	struct siw_cq *cq = to_siw_cq(base_cq);
	int rv, size = attr->cqe;

	if (atomic_inc_return(&sdev->num_cq) > SIW_MAX_CQ) {
		siw_dbg(base_cq->device, "too many CQ's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (size < 1 || size > sdev->attrs.max_cqe) {
		siw_dbg(base_cq->device, "CQ size error: %d\n", size);
		rv = -EINVAL;
		goto err_out;
	}
	size = roundup_pow_of_two(size);
	cq->base_cq.cqe = size;
	cq->num_cqe = size;
	cq->xa_cq_index = SIW_INVAL_UOBJ_KEY;

	if (!udata) {
		cq->kernel_verbs = 1;
		cq->queue = vzalloc(size * sizeof(struct siw_cqe) +
				    sizeof(struct siw_cq_ctrl));
	} else {
		cq->queue = vmalloc_user(size * sizeof(struct siw_cqe) +
					 sizeof(struct siw_cq_ctrl));
	}
	if (cq->queue == NULL) {
		rv = -ENOMEM;
		goto err_out;
	}
	get_random_bytes(&cq->id, 4);
	siw_dbg(base_cq->device, "new CQ [%u]\n", cq->id);

	spin_lock_init(&cq->lock);

	cq->notify = (struct siw_cq_ctrl *)&cq->queue[size];

	if (udata) {
		struct siw_uresp_create_cq uresp = {};
		struct siw_ucontext *ctx =
			rdma_udata_to_drv_context(udata, struct siw_ucontext,
						  base_ucontext);

		cq->xa_cq_index =
			siw_create_uobj(ctx, cq->queue,
					size * sizeof(struct siw_cqe) +
						sizeof(struct siw_cq_ctrl));
		if (cq->xa_cq_index == SIW_INVAL_UOBJ_KEY) {
			rv = -ENOMEM;
			goto err_out;
		}
		uresp.cq_key = cq->xa_cq_index << PAGE_SHIFT;
		uresp.cq_id = cq->id;
		uresp.num_cqe = size;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out;
	}
	return 0;

err_out:
	siw_dbg(base_cq->device, "CQ creation failed: %d", rv);

	if (cq && cq->queue) {
		struct siw_ucontext *ctx =
			rdma_udata_to_drv_context(udata, struct siw_ucontext,
						  base_ucontext);
		if (cq->xa_cq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&ctx->xa, cq->xa_cq_index));
		vfree(cq->queue);
	}
	atomic_dec(&sdev->num_cq);

	return rv;
}

/*
 * siw_poll_cq()
 *
 * Reap CQ entries if available and copy work completion status into
 * array of WC's provided by caller. Returns number of reaped CQE's.
 *
 * @base_cq: Base CQ contained in siw CQ.
 * @num_cqe: Maximum number of CQE's to reap.
 * @wc: Array of work completions to be filled by siw.
 */
int siw_poll_cq(struct ib_cq *base_cq, int num_cqe, struct ib_wc *wc)
{
	struct siw_cq *cq = to_siw_cq(base_cq);
	int i;

	for (i = 0; i < num_cqe; i++) {
		if (!siw_reap_cqe(cq, wc))
			break;
		wc++;
	}
	return i;
}

/*
 * siw_req_notify_cq()
 *
 * Request notification for new CQE's added to that CQ.
 * Defined flags:
 * o SIW_CQ_NOTIFY_SOLICITED lets siw trigger a notification
 *   event if a WQE with notification flag set enters the CQ
 * o SIW_CQ_NOTIFY_NEXT_COMP lets siw trigger a notification
 *   event if a WQE enters the CQ.
 * o IB_CQ_REPORT_MISSED_EVENTS: return value will provide the
 *   number of not reaped CQE's regardless of their notification
 *   type and current or new CQ notification settings.
 *
 * @base_cq: Base CQ contained in siw CQ.
 * @flags: Requested notification flags.
 */
int siw_req_notify_cq(struct ib_cq *base_cq, enum ib_cq_notify_flags flags)
{
	struct siw_cq *cq = to_siw_cq(base_cq);

	siw_dbg_cq(cq, "flags: 0x%02x\n", flags);

	if ((flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED)
		/*
		 * Enable CQ event for next solicited completion
		 * and make it visible to all associated producers.
		 */
		smp_store_mb(cq->notify->flags, SIW_NOTIFY_SOLICITED);
	else
		/*
		 * Enable CQ event for any signalled completion
		 * and make it visible to all associated producers.
		 */
		smp_store_mb(cq->notify->flags, SIW_NOTIFY_ALL);

	if (flags & IB_CQ_REPORT_MISSED_EVENTS)
		return cq->cq_put - cq->cq_get;

	return 0;
}

/*
 * siw_dereg_mr()
 *
 * Release Memory Region.
 *
 * @base_mr: Base MR contained in siw MR.
 * @udata: points to user context, unused.
 */
int siw_dereg_mr(struct ib_mr *base_mr, struct ib_udata *udata)
{
	struct siw_mr *mr = to_siw_mr(base_mr);
	struct siw_device *sdev = to_siw_dev(base_mr->device);

	siw_dbg_mem(mr->mem, "deregister MR\n");

	atomic_dec(&sdev->num_mr);

	siw_mr_drop_mem(mr);
	kfree_rcu(mr, rcu);

	return 0;
}

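/*
 * Note on the RLIMIT_MEMLOCK check in siw_reg_user_mr() below: the page
 * count is derived from the registration's offset into its first page
 * plus its length, i.e. PAGE_ALIGN(len + (start & ~PAGE_MASK)) >>
 * PAGE_SHIFT. For example, with 4 KiB pages, registering len = 8192
 * bytes starting at an offset of 0x100 into a page touches three pages,
 * since PAGE_ALIGN(8192 + 0x100) >> PAGE_SHIFT == 3.
 */
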
/*
 * siw_reg_user_mr()
 *
 * Register Memory Region.
 *
 * @pd: Protection Domain
 * @start: starting address of MR (virtual address)
 * @len: length of MR
 * @rnic_va: not used by siw
 * @rights: MR access rights
 * @udata: user buffer to communicate STag and Key.
 */
struct ib_mr *siw_reg_user_mr(struct ib_pd *pd, u64 start, u64 len,
			      u64 rnic_va, int rights, struct ib_udata *udata)
{
	struct siw_mr *mr = NULL;
	struct siw_umem *umem = NULL;
	struct siw_ureq_reg_mr ureq;
	struct siw_device *sdev = to_siw_dev(pd->device);

	unsigned long mem_limit = rlimit(RLIMIT_MEMLOCK);
	int rv;

	siw_dbg_pd(pd, "start: 0x%pK, va: 0x%pK, len: %llu\n",
		   (void *)(uintptr_t)start, (void *)(uintptr_t)rnic_va,
		   (unsigned long long)len);

	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
		siw_dbg_pd(pd, "too many mr's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (!len) {
		rv = -EINVAL;
		goto err_out;
	}
	if (mem_limit != RLIM_INFINITY) {
		unsigned long num_pages =
			(PAGE_ALIGN(len + (start & ~PAGE_MASK))) >> PAGE_SHIFT;
		mem_limit >>= PAGE_SHIFT;

		if (num_pages > mem_limit - current->mm->locked_vm) {
			siw_dbg_pd(pd, "pages req %lu, max %lu, lock %lu\n",
				   num_pages, mem_limit,
				   current->mm->locked_vm);
			rv = -ENOMEM;
			goto err_out;
		}
	}
	umem = siw_umem_get(start, len, ib_access_writable(rights));
	if (IS_ERR(umem)) {
		rv = PTR_ERR(umem);
		siw_dbg_pd(pd, "getting user memory failed: %d\n", rv);
		umem = NULL;
		goto err_out;
	}
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr) {
		rv = -ENOMEM;
		goto err_out;
	}
	rv = siw_mr_add_mem(mr, pd, umem, start, len, rights);
	if (rv)
		goto err_out;

	if (udata) {
		struct siw_uresp_reg_mr uresp = {};
		struct siw_mem *mem = mr->mem;

		if (udata->inlen < sizeof(ureq)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_from_udata(&ureq, udata, sizeof(ureq));
		if (rv)
			goto err_out;

		mr->base_mr.lkey |= ureq.stag_key;
		mr->base_mr.rkey |= ureq.stag_key;
		mem->stag |= ureq.stag_key;
		uresp.stag = mem->stag;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out;
	}
	mr->mem->stag_valid = 1;

	return &mr->base_mr;

err_out:
	atomic_dec(&sdev->num_mr);
	if (mr) {
		if (mr->mem)
			siw_mr_drop_mem(mr);
		kfree_rcu(mr, rcu);
	} else {
		if (umem)
			siw_umem_release(umem, false);
	}
	return ERR_PTR(rv);
}

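/*
 * siw_alloc_mr()
 *
 * Allocate a memory region for fast registration, backed by a freshly
 * allocated physical buffer list (PBL) of up to @max_sge entries. The MR
 * becomes usable once it is registered via an IB_WR_REG_MR work request
 * (see siw_map_mr_sg() and siw_post_send()).
 */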
struct ib_mr *siw_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type,
			   u32 max_sge, struct ib_udata *udata)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mr *mr = NULL;
	struct siw_pbl *pbl = NULL;
	int rv;

	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
		siw_dbg_pd(pd, "too many mr's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (mr_type != IB_MR_TYPE_MEM_REG) {
		siw_dbg_pd(pd, "mr type %d unsupported\n", mr_type);
		rv = -EOPNOTSUPP;
		goto err_out;
	}
	if (max_sge > SIW_MAX_SGE_PBL) {
		siw_dbg_pd(pd, "too many sge's: %d\n", max_sge);
		rv = -ENOMEM;
		goto err_out;
	}
	pbl = siw_pbl_alloc(max_sge);
	if (IS_ERR(pbl)) {
		rv = PTR_ERR(pbl);
		siw_dbg_pd(pd, "pbl allocation failed: %d\n", rv);
		pbl = NULL;
		goto err_out;
	}
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr) {
		rv = -ENOMEM;
		goto err_out;
	}
	rv = siw_mr_add_mem(mr, pd, pbl, 0, max_sge * PAGE_SIZE, 0);
	if (rv)
		goto err_out;

	mr->mem->is_pbl = 1;

	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);

	return &mr->base_mr;

err_out:
	atomic_dec(&sdev->num_mr);

	if (!mr) {
		kfree(pbl);
	} else {
		if (mr->mem)
			siw_mr_drop_mem(mr);
		kfree_rcu(mr, rcu);
	}
	siw_dbg_pd(pd, "failed: %d\n", rv);

	return ERR_PTR(rv);
}

/* Just used to count number of pages being mapped */
static int siw_set_pbl_page(struct ib_mr *base_mr, u64 buf_addr)
{
	return 0;
}

int siw_map_mr_sg(struct ib_mr *base_mr, struct scatterlist *sl, int num_sle,
		  unsigned int *sg_off)
{
	struct scatterlist *slp;
	struct siw_mr *mr = to_siw_mr(base_mr);
	struct siw_mem *mem = mr->mem;
	struct siw_pbl *pbl = mem->pbl;
	struct siw_pble *pble;
	unsigned long pbl_size;
	int i, rv;

	if (!pbl) {
		siw_dbg_mem(mem, "no PBL allocated\n");
		return -EINVAL;
	}
	pble = pbl->pbe;

	if (pbl->max_buf < num_sle) {
		siw_dbg_mem(mem, "too many SGE's: %d > %d\n",
			    num_sle, mem->pbl->max_buf);
		return -ENOMEM;
	}
	for_each_sg(sl, slp, num_sle, i) {
		if (sg_dma_len(slp) == 0) {
			siw_dbg_mem(mem, "empty SGE\n");
			return -EINVAL;
		}
		if (i == 0) {
			pble->addr = sg_dma_address(slp);
			pble->size = sg_dma_len(slp);
			pble->pbl_off = 0;
			pbl_size = pble->size;
			pbl->num_buf = 1;
		} else {
			/* Merge PBL entries if adjacent */
			if (pble->addr + pble->size == sg_dma_address(slp)) {
				pble->size += sg_dma_len(slp);
			} else {
				pble++;
				pbl->num_buf++;
				pble->addr = sg_dma_address(slp);
				pble->size = sg_dma_len(slp);
				pble->pbl_off = pbl_size;
			}
			pbl_size += sg_dma_len(slp);
		}
		siw_dbg_mem(mem,
			    "sge[%d], size %u, addr 0x%p, total %lu\n",
			    i, pble->size, (void *)(uintptr_t)pble->addr,
			    pbl_size);
	}
	rv = ib_sg_to_pages(base_mr, sl, num_sle, sg_off, siw_set_pbl_page);
	if (rv > 0) {
		mem->len = base_mr->length;
		mem->va = base_mr->iova;
		siw_dbg_mem(mem,
			    "%llu bytes, start 0x%pK, %u SLE to %u entries\n",
			    mem->len, (void *)(uintptr_t)mem->va, num_sle,
			    pbl->num_buf);
	}
	return rv;
}

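/*
 * A rough sketch of the fast-registration flow from a kernel client's
 * perspective (illustrative, not siw specific): allocate an MR with
 * ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, max_sge), map a scatterlist onto
 * it with ib_map_mr_sg(), then post an IB_WR_REG_MR work request, which
 * siw_post_send() turns into a SIW_OP_REG_MR SQE; the registration takes
 * effect when that work request is processed.
 */
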
/*
 * siw_get_dma_mr()
 *
 * Create an (empty) DMA memory region, where no umem is attached.
 */
struct ib_mr *siw_get_dma_mr(struct ib_pd *pd, int rights)
{
	struct siw_device *sdev = to_siw_dev(pd->device);
	struct siw_mr *mr = NULL;
	int rv;

	if (atomic_inc_return(&sdev->num_mr) > SIW_MAX_MR) {
		siw_dbg_pd(pd, "too many mr's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
	if (!mr) {
		rv = -ENOMEM;
		goto err_out;
	}
	rv = siw_mr_add_mem(mr, pd, NULL, 0, ULONG_MAX, rights);
	if (rv)
		goto err_out;

	mr->mem->stag_valid = 1;

	siw_dbg_pd(pd, "[MEM %u]: success\n", mr->mem->stag);

	return &mr->base_mr;

err_out:
	if (rv)
		kfree(mr);

	atomic_dec(&sdev->num_mr);

	return ERR_PTR(rv);
}

/*
 * siw_create_srq()
 *
 * Create Shared Receive Queue of attributes @init_attrs
 * within protection domain given by @pd.
 *
 * @base_srq: Base SRQ contained in siw SRQ.
 * @init_attrs: SRQ init attributes.
 * @udata: points to user context
 */
int siw_create_srq(struct ib_srq *base_srq,
		   struct ib_srq_init_attr *init_attrs, struct ib_udata *udata)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	struct ib_srq_attr *attrs = &init_attrs->attr;
	struct siw_device *sdev = to_siw_dev(base_srq->device);
	struct siw_ucontext *ctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);
	int rv;

	if (atomic_inc_return(&sdev->num_srq) > SIW_MAX_SRQ) {
		siw_dbg_pd(base_srq->pd, "too many SRQ's\n");
		rv = -ENOMEM;
		goto err_out;
	}
	if (attrs->max_wr == 0 || attrs->max_wr > SIW_MAX_SRQ_WR ||
	    attrs->max_sge > SIW_MAX_SGE || attrs->srq_limit > attrs->max_wr) {
		rv = -EINVAL;
		goto err_out;
	}
	srq->max_sge = attrs->max_sge;
	srq->num_rqe = roundup_pow_of_two(attrs->max_wr);
	srq->xa_srq_index = SIW_INVAL_UOBJ_KEY;
	srq->limit = attrs->srq_limit;
	if (srq->limit)
		srq->armed = 1;

	srq->kernel_verbs = !udata;

	if (udata)
		srq->recvq =
			vmalloc_user(srq->num_rqe * sizeof(struct siw_rqe));
	else
		srq->recvq = vzalloc(srq->num_rqe * sizeof(struct siw_rqe));

	if (srq->recvq == NULL) {
		rv = -ENOMEM;
		goto err_out;
	}
	if (udata) {
		struct siw_uresp_create_srq uresp = {};

		srq->xa_srq_index = siw_create_uobj(
			ctx, srq->recvq, srq->num_rqe * sizeof(struct siw_rqe));

		if (srq->xa_srq_index == SIW_INVAL_UOBJ_KEY) {
			rv = -ENOMEM;
			goto err_out;
		}
		uresp.srq_key = srq->xa_srq_index;
		uresp.num_rqe = srq->num_rqe;

		if (udata->outlen < sizeof(uresp)) {
			rv = -EINVAL;
			goto err_out;
		}
		rv = ib_copy_to_udata(udata, &uresp, sizeof(uresp));
		if (rv)
			goto err_out;
	}
	spin_lock_init(&srq->lock);

	siw_dbg_pd(base_srq->pd, "[SRQ]: success\n");

	return 0;

err_out:
	if (srq->recvq) {
		if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
			kfree(xa_erase(&ctx->xa, srq->xa_srq_index));
		vfree(srq->recvq);
	}
	atomic_dec(&sdev->num_srq);

	return rv;
}

/*
 * siw_modify_srq()
 *
 * Modify SRQ. The caller may resize SRQ and/or set/reset notification
 * limit and (re)arm IB_EVENT_SRQ_LIMIT_REACHED notification.
 *
 * NOTE: it is unclear if RDMA core allows for changing the MAX_SGE
 * parameter. siw_modify_srq() does not check the attrs->max_sge param.
 */
int siw_modify_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs,
		   enum ib_srq_attr_mask attr_mask, struct ib_udata *udata)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	unsigned long flags;
	int rv = 0;

	spin_lock_irqsave(&srq->lock, flags);

	if (attr_mask & IB_SRQ_MAX_WR) {
		/* resize request not yet supported */
		rv = -EOPNOTSUPP;
		goto out;
	}
	if (attr_mask & IB_SRQ_LIMIT) {
		if (attrs->srq_limit) {
			if (unlikely(attrs->srq_limit > srq->num_rqe)) {
				rv = -EINVAL;
				goto out;
			}
			srq->armed = 1;
		} else {
			srq->armed = 0;
		}
		srq->limit = attrs->srq_limit;
	}
out:
	spin_unlock_irqrestore(&srq->lock, flags);

	return rv;
}

/*
 * siw_query_srq()
 *
 * Query SRQ attributes.
 */
int siw_query_srq(struct ib_srq *base_srq, struct ib_srq_attr *attrs)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	unsigned long flags;

	spin_lock_irqsave(&srq->lock, flags);

	attrs->max_wr = srq->num_rqe;
	attrs->max_sge = srq->max_sge;
	attrs->srq_limit = srq->limit;

	spin_unlock_irqrestore(&srq->lock, flags);

	return 0;
}

/*
 * siw_destroy_srq()
 *
 * Destroy SRQ.
 * It is assumed that the SRQ is not referenced by any
 * QP anymore - the code trusts the RDMA core environment to keep track
 * of QP references.
 */
void siw_destroy_srq(struct ib_srq *base_srq, struct ib_udata *udata)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	struct siw_device *sdev = to_siw_dev(base_srq->device);
	struct siw_ucontext *ctx =
		rdma_udata_to_drv_context(udata, struct siw_ucontext,
					  base_ucontext);

	if (ctx && srq->xa_srq_index != SIW_INVAL_UOBJ_KEY)
		kfree(xa_erase(&ctx->xa, srq->xa_srq_index));

	vfree(srq->recvq);
	atomic_dec(&sdev->num_srq);
}

/*
 * siw_post_srq_recv()
 *
 * Post a list of receive queue elements to SRQ.
 * NOTE: The function does not check or lock a certain SRQ state
 *       during the post operation. The code simply trusts the
 *       RDMA core environment.
 *
 * @base_srq: Base SRQ contained in siw SRQ
 * @wr: List of R-WR's
 * @bad_wr: Updated to failing WR if posting fails.
 */
int siw_post_srq_recv(struct ib_srq *base_srq, const struct ib_recv_wr *wr,
		      const struct ib_recv_wr **bad_wr)
{
	struct siw_srq *srq = to_siw_srq(base_srq);
	unsigned long flags;
	int rv = 0;

	if (unlikely(!srq->kernel_verbs)) {
		siw_dbg_pd(base_srq->pd,
			   "[SRQ]: no kernel post_recv for mapped srq\n");
		rv = -EINVAL;
		goto out;
	}
	/*
	 * Serialize potentially multiple producers.
	 * Also needed to serialize potentially multiple
	 * consumers.
	 */
	spin_lock_irqsave(&srq->lock, flags);

	while (wr) {
		u32 idx = srq->rq_put % srq->num_rqe;
		struct siw_rqe *rqe = &srq->recvq[idx];

		if (rqe->flags) {
			siw_dbg_pd(base_srq->pd, "SRQ full\n");
			rv = -ENOMEM;
			break;
		}
		if (unlikely(wr->num_sge > srq->max_sge)) {
			siw_dbg_pd(base_srq->pd,
				   "[SRQ]: too many sge's: %d\n", wr->num_sge);
			rv = -EINVAL;
			break;
		}
		rqe->id = wr->wr_id;
		rqe->num_sge = wr->num_sge;
		siw_copy_sgl(wr->sg_list, rqe->sge, wr->num_sge);

		/* Make sure S-RQE is completely written before valid */
		smp_wmb();

		rqe->flags = SIW_WQE_VALID;

		srq->rq_put++;
		wr = wr->next;
	}
	spin_unlock_irqrestore(&srq->lock, flags);
out:
	if (unlikely(rv < 0)) {
		siw_dbg_pd(base_srq->pd, "[SRQ]: error %d\n", rv);
		*bad_wr = wr;
	}
	return rv;
}

void siw_qp_event(struct siw_qp *qp, enum ib_event_type etype)
{
	struct ib_event event;
	struct ib_qp *base_qp = qp->ib_qp;

	/*
	 * Do not report asynchronous errors on QP which gets
	 * destroyed via verbs interface (siw_destroy_qp())
	 */
	if (qp->attrs.flags & SIW_QP_IN_DESTROY)
		return;

	event.event = etype;
	event.device = base_qp->device;
	event.element.qp = base_qp;

	if (base_qp->event_handler) {
		siw_dbg_qp(qp, "reporting event %d\n", etype);
		base_qp->event_handler(&event, base_qp->qp_context);
	}
}

void siw_cq_event(struct siw_cq *cq, enum ib_event_type etype)
{
	struct ib_event event;
	struct ib_cq *base_cq = &cq->base_cq;

	event.event = etype;
	event.device = base_cq->device;
	event.element.cq = base_cq;

	if (base_cq->event_handler) {
		siw_dbg_cq(cq, "reporting CQ event %d\n", etype);
		base_cq->event_handler(&event, base_cq->cq_context);
	}
}

void siw_srq_event(struct siw_srq *srq, enum ib_event_type etype)
{
	struct ib_event event;
	struct ib_srq *base_srq = &srq->base_srq;

	event.event = etype;
	event.device = base_srq->device;
	event.element.srq = base_srq;

	if (base_srq->event_handler) {
		siw_dbg_pd(srq->base_srq.pd,
			   "reporting SRQ event %d\n", etype);
		base_srq->event_handler(&event, base_srq->srq_context);
	}
}

void siw_port_event(struct siw_device *sdev, u8 port, enum ib_event_type etype)
{
	struct ib_event event;

	event.event = etype;
	event.device = &sdev->base_dev;
	event.element.port_num = port;

	siw_dbg(&sdev->base_dev, "reporting port event %d\n", etype);

	ib_dispatch_event(&event);
}