/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <config.h>

#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <errno.h>

#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"

static const uint32_t mlx4_ib_opcode[] = {
	[IBV_WR_SEND]			= MLX4_OPCODE_SEND,
	[IBV_WR_SEND_WITH_IMM]		= MLX4_OPCODE_SEND_IMM,
	[IBV_WR_RDMA_WRITE]		= MLX4_OPCODE_RDMA_WRITE,
	[IBV_WR_RDMA_WRITE_WITH_IMM]	= MLX4_OPCODE_RDMA_WRITE_IMM,
	[IBV_WR_RDMA_READ]		= MLX4_OPCODE_RDMA_READ,
	[IBV_WR_ATOMIC_CMP_AND_SWP]	= MLX4_OPCODE_ATOMIC_CS,
	[IBV_WR_ATOMIC_FETCH_AND_ADD]	= MLX4_OPCODE_ATOMIC_FA,
	[IBV_WR_LOCAL_INV]		= MLX4_OPCODE_LOCAL_INVAL,
	[IBV_WR_BIND_MW]		= MLX4_OPCODE_BIND_MW,
	[IBV_WR_SEND_WITH_INV]		= MLX4_OPCODE_SEND_INVAL,
};

static void *get_recv_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
	uint32_t *wqe = get_send_wqe(qp, n);
	int i;
	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;

	for (i = 16; i < ds; i += 16)
		wqe[i] = 0xffffffff;
}
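
/*
 * For example, with fence_size = 16 (a 256-byte WQE) the loop above
 * computes ds = 64 32-bit words and stamps wqe[16], wqe[32] and
 * wqe[48], i.e. the first dword of the 64-byte chunks at byte offsets
 * 64, 128 and 192; the chunk holding the control segment is left
 * untouched.
 */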

void mlx4_init_qp_indices(struct mlx4_qp *qp)
{
	qp->sq.head = 0;
	qp->sq.tail = 0;
	qp->rq.head = 0;
	qp->rq.tail = 0;
}

void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	int i;

	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
		ctrl = get_send_wqe(qp, i);
		ctrl->owner_opcode = htobe32(1 << 31);
		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

		stamp_send_wqe(qp, i);
	}
}

static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
{
	unsigned cur;

	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max_post)
		return 0;

	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}

static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr)
{
	int acc = wr->bind_mw.bind_info.mw_access_flags;
	bseg->flags1 = 0;
	if (acc & IBV_ACCESS_REMOTE_ATOMIC)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC);
	if (acc & IBV_ACCESS_REMOTE_WRITE)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE);
	if (acc & IBV_ACCESS_REMOTE_READ)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ);

	bseg->flags2 = 0;
	if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
		bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2);
	if (acc & IBV_ACCESS_ZERO_BASED)
		bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED);

	bseg->new_rkey = htobe32(wr->bind_mw.rkey);
	bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey);
	bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
	bseg->length = htobe64(wr->bind_mw.bind_info.length);
}

static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
				     uint32_t rkey)
{
	iseg->mem_key = htobe32(rkey);

	iseg->reserved1 = 0;
	iseg->reserved2 = 0;
	iseg->reserved3[0] = 0;
	iseg->reserved3[1] = 0;
}

static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
				 uint64_t remote_addr, uint32_t rkey)
{
	rseg->raddr    = htobe64(remote_addr);
	rseg->rkey     = htobe32(rkey);
	rseg->reserved = 0;
}

static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
{
	if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
		aseg->swap_add = htobe64(wr->wr.atomic.swap);
		aseg->compare  = htobe64(wr->wr.atomic.compare_add);
	} else {
		aseg->swap_add = htobe64(wr->wr.atomic.compare_add);
		aseg->compare  = 0;
	}
}

static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
			     struct ibv_send_wr *wr)
{
	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
	dseg->dqpn = htobe32(wr->wr.ud.remote_qpn);
	dseg->qkey = htobe32(wr->wr.ud.remote_qkey);
	dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan);
	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
}

static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->byte_count = htobe32(sg->length);
	dseg->lkey       = htobe32(sg->lkey);
	dseg->addr       = htobe64(sg->addr);
}

static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->lkey = htobe32(sg->lkey);
	dseg->addr = htobe64(sg->addr);

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but
	 * stale data, and end up sending the wrong data.
	 */
	udma_to_device_barrier();

	if (likely(sg->length))
		dseg->byte_count = htobe32(sg->length);
	else
		dseg->byte_count = htobe32(0x80000000);
}
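
/*
 * In outline, mlx4_post_send() below builds each send WQE as a 16-byte
 * control segment, followed by any transport- or opcode-specific
 * segments (a datagram segment for UD, a remote address and/or atomic
 * segment for RDMA and atomic opcodes, a bind or local-invalidate
 * segment for IBV_WR_BIND_MW and IBV_WR_LOCAL_INV), followed by the
 * gather list as either data segments or inline segments.  The total
 * length in 16-byte units is accumulated in 'size', stored in
 * ctrl->fence_size, and the WQE is released to the hardware by writing
 * ctrl->owner_opcode after a barrier.
 */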

int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
		   struct ibv_send_wr **bad_wr)
{
	struct mlx4_context *ctx;
	struct mlx4_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl = NULL;
	int ind;
	int nreq;
	int inl = 0;
	int ret = 0;
	int size = 0;
	int i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
			ret = EINVAL;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->srcrb_flags =
			(wr->send_flags & IBV_SEND_SIGNALED ?
			 htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IBV_SEND_SOLICITED ?
			 htobe32(MLX4_WQE_CTRL_SOLICIT) : 0)   |
			qp->sq_signal_bits;

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_XRC_SEND:
			ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
			/* fall through */
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IBV_WR_RDMA_READ:
				inl = 1;
				/* fall through */
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				if (!wr->num_sge)
					inl = 1;
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

				break;
			case IBV_WR_LOCAL_INV:
				ctrl->srcrb_flags |=
					htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
				set_local_inv_seg(wqe, wr->imm_data);
				wqe  += sizeof
					(struct mlx4_wqe_local_inval_seg);
				size += sizeof
					(struct mlx4_wqe_local_inval_seg) / 16;
				break;
			case IBV_WR_BIND_MW:
				ctrl->srcrb_flags |=
					htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
				set_bind_seg(wqe, wr);
				wqe  += sizeof
					(struct mlx4_wqe_bind_seg);
				size += sizeof
					(struct mlx4_wqe_bind_seg) / 16;
				break;
			case IBV_WR_SEND_WITH_INV:
				ctrl->imm = htobe32(wr->imm_data);
				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;

			if (wr->send_flags & IBV_SEND_IP_CSUM) {
				if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) {
					ret = EINVAL;
					*bad_wr = wr;
					goto out;
				}
				ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
							     MLX4_WQE_CTRL_TCP_UDP_CSUM);
			}
			break;

		case IBV_QPT_RAW_PACKET:
			/* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
			 * to indicate that no icrc should be calculated */
			ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT);
			if (wr->send_flags & IBV_SEND_IP_CSUM) {
				if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) {
					ret = EINVAL;
					*bad_wr = wr;
					goto out;
				}
				ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
							     MLX4_WQE_CTRL_TCP_UDP_CSUM);
			}
			break;

		default:
			break;
		}

		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			struct mlx4_wqe_inline_seg *seg;
			void *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = wqe;
			wqe += sizeof *seg;
			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_sge; ++i) {
				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
				len  = wr->sg_list[i].length;
				inl += len;

				if (inl > qp->max_inline_data) {
					inl = 0;
					ret = ENOMEM;
					*bad_wr = wr;
					goto out;
				}

				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;
					udma_to_device_barrier(); /* see comment below */
					seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				udma_to_device_barrier();
				seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof * seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = wqe;

			for (i = wr->num_sge - 1; i >= 0 ; --i)
				set_data_seg(seg + i, wr->sg_list + i);

			size += wr->num_sge * (sizeof *seg / 16);
		}

		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		udma_to_device_barrier();

		ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
			(ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	ctx = to_mctx(ibqp->context);

	if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);

		ctrl->bf_qpn |= qp->doorbell_qpn;
		++qp->sq.head;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		mmio_wc_spinlock(&ctx->bf_lock);

		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
			     align(size * 16, 64));
		/* Flush before toggling bf_offset to be latency oriented */
		mmio_flush_writes();

		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		udma_to_device_barrier();

		mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
			    qp->doorbell_qpn);
	}

	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}
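
/*
 * Illustrative use only (not part of this provider): applications reach
 * mlx4_post_send() through the generic ibv_post_send() verb.  A minimal
 * signalled RDMA write on an RC QP, assuming 'qp', 'mr', 'buf', 'len',
 * 'remote_addr' and 'rkey' were set up elsewhere, looks roughly like:
 *
 *	struct ibv_sge sge = {
 *		.addr   = (uintptr_t) buf,
 *		.length = len,
 *		.lkey   = mr->lkey,
 *	};
 *	struct ibv_send_wr wr = {
 *		.wr_id      = 1,
 *		.sg_list    = &sge,
 *		.num_sge    = 1,
 *		.opcode     = IBV_WR_RDMA_WRITE,
 *		.send_flags = IBV_SEND_SIGNALED,
 *		.wr.rdma    = { .remote_addr = remote_addr, .rkey = rkey },
 *	};
 *	struct ibv_send_wr *bad_wr;
 *	int err = ibv_post_send(qp, &wr, &bad_wr);
 *
 * On failure the verb returns an errno value and 'bad_wr' points at the
 * first work request that was not posted, matching the *bad_wr handling
 * above.
 */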

int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
		   struct ibv_recv_wr **bad_wr)
{
	struct mlx4_qp *qp = to_mqp(ibqp);
	struct mlx4_wqe_data_seg *scat;
	int ret = 0;
	int nreq;
	int ind;
	int i;

	pthread_spin_lock(&qp->rq.lock);

	/* XXX check that state is OK to post receive */

	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->rq.max_gs) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		scat = get_recv_wqe(qp, ind);

		for (i = 0; i < wr->num_sge; ++i)
			__set_data_seg(scat + i, wr->sg_list + i);

		if (i < qp->rq.max_gs) {
			scat[i].byte_count = 0;
			scat[i].lkey       = htobe32(MLX4_INVALID_LKEY);
			scat[i].addr       = 0;
		}

		qp->rq.wrid[ind] = wr->wr_id;

		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
	}

out:
	if (nreq) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		udma_to_device_barrier();

		*qp->db = htobe32(qp->rq.head & 0xffff);
	}

	pthread_spin_unlock(&qp->rq.lock);

	return ret;
}
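
/*
 * The receive path above is simpler than the send path: a receive WQE
 * carries only scatter entries (no control segment is built), a scatter
 * list shorter than rq.max_gs is terminated with an MLX4_INVALID_LKEY
 * sentinel entry, and new WQEs are made visible to the hardware by
 * updating the doorbell record (*qp->db) after a barrier rather than by
 * the UAR/BlueFlame doorbell writes used for sends.
 */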

static int num_inline_segs(int data, enum ibv_qp_type type)
{
	/*
	 * Inline data segments are not allowed to cross 64 byte
	 * boundaries.  For UD QPs, the data segments always start
	 * aligned to 64 bytes (16 byte control segment + 48 byte
	 * datagram segment); for other QPs, there will be a 16 byte
	 * control segment and possibly a 16 byte remote address
	 * segment, so in the worst case there will be only 32 bytes
	 * available for the first data segment.
	 */
	if (type == IBV_QPT_UD)
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_datagram_seg)) %
			MLX4_INLINE_ALIGN;
	else
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_raddr_seg)) %
			MLX4_INLINE_ALIGN;

	return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
		(MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
}
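
/*
 * Example for the calculation above, assuming a 64-byte
 * MLX4_INLINE_ALIGN, a 4-byte struct mlx4_wqe_inline_seg, and the
 * segment sizes given in the comment (16-byte control, 16-byte remote
 * address): an RC QP with 64 bytes of inline data gets
 * data = 64 + (16 + 16) % 64 = 96, and
 * (96 + 64 - 4 - 1) / (64 - 4) = 2 inline segment headers -- 28 data
 * bytes fit after the header in the partially used first chunk and the
 * remaining 36 bytes in the next one.
 */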

void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
			   struct mlx4_qp *qp)
{
	int size;
	int max_sq_sge;

	max_sq_sge = align(cap->max_inline_data +
			   num_inline_segs(cap->max_inline_data, type) *
			   sizeof (struct mlx4_wqe_inline_seg),
			   sizeof (struct mlx4_wqe_data_seg)) /
		sizeof (struct mlx4_wqe_data_seg);
	if (max_sq_sge < cap->max_send_sge)
		max_sq_sge = cap->max_send_sge;

	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
	switch (type) {
	case IBV_QPT_UD:
		size += sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		break;

	case IBV_QPT_XRC_SEND:
	case IBV_QPT_RC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		/*
		 * An atomic op will require an atomic segment, a
		 * remote address segment and one scatter entry.
		 */
		if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
			    sizeof (struct mlx4_wqe_raddr_seg) +
			    sizeof (struct mlx4_wqe_data_seg)))
			size = (sizeof (struct mlx4_wqe_atomic_seg) +
				sizeof (struct mlx4_wqe_raddr_seg) +
				sizeof (struct mlx4_wqe_data_seg));
		break;

	default:
		break;
	}

	/* Make sure that we have enough space for a bind request */
	if (size < sizeof (struct mlx4_wqe_bind_seg))
		size = sizeof (struct mlx4_wqe_bind_seg);

	size += sizeof (struct mlx4_wqe_ctrl_seg);

	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
	     qp->sq.wqe_shift++)
		; /* nothing */
}

int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
		      enum ibv_qp_type type, struct mlx4_qp *qp)
{
	qp->rq.max_gs = cap->max_recv_sge;

	if (qp->sq.wqe_cnt) {
		qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
		if (!qp->sq.wrid)
			return -1;
	}

	if (qp->rq.wqe_cnt) {
		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
		if (!qp->rq.wrid) {
			free(qp->sq.wrid);
			return -1;
		}
	}

	for (qp->rq.wqe_shift = 4;
	     1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
	     qp->rq.wqe_shift++)
		; /* nothing */

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
		qp->rq.offset = 0;
		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
	} else {
		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
		qp->sq.offset = 0;
	}

	if (qp->buf_size) {
		if (mlx4_alloc_buf(&qp->buf,
				   align(qp->buf_size, to_mdev(context->device)->page_size),
				   to_mdev(context->device)->page_size)) {
			free(qp->sq.wrid);
			free(qp->rq.wrid);
			return -1;
		}

		memset(qp->buf.buf, 0, qp->buf_size);
	} else {
		qp->buf.buf = NULL;
	}

	return 0;
}

void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type)
{
	int wqe_size;

	wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg);
	switch (type) {
	case IBV_QPT_UD:
		wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_XRC_SEND:
	case IBV_QPT_UC:
	case IBV_QPT_RC:
		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
		break;

	default:
		break;
	}

	qp->sq.max_gs	  = wqe_size / sizeof (struct mlx4_wqe_data_seg);
	cap->max_send_sge = qp->sq.max_gs;
	qp->sq.max_post	  = qp->sq.wqe_cnt - qp->sq_spare_wqes;
	cap->max_send_wr  = qp->sq.max_post;

	/*
	 * Inline data segments can't cross a 64 byte boundary.  So
	 * subtract off one segment header for each 64-byte chunk,
	 * taking into account the fact that wqe_size will be 32 mod
	 * 64 for non-UD QPs.
	 */
	qp->max_inline_data = wqe_size -
		sizeof (struct mlx4_wqe_inline_seg) *
		(align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
	cap->max_inline_data = qp->max_inline_data;
}
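
/*
 * Example for the max_inline_data computation above, again assuming a
 * 64-byte MLX4_INLINE_ALIGN and a 4-byte inline segment header: an RC
 * QP with a 128-byte WQE stride (sq.wqe_shift = 7) has
 * wqe_size = 128 - 16 - 16 = 96 bytes, which spans
 * align(96, 64) / 64 = 2 chunks, giving
 * max_inline_data = 96 - 2 * 4 = 88 bytes.
 */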

struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (ctx->qp_table[tind].refcnt)
		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
	else
		return NULL;
}

int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!ctx->qp_table[tind].refcnt) {
		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
						   sizeof (struct mlx4_qp *));
		if (!ctx->qp_table[tind].table)
			return -1;
	}

	++ctx->qp_table[tind].refcnt;
	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
	return 0;
}

void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!--ctx->qp_table[tind].refcnt)
		free(ctx->qp_table[tind].table);
	else
		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}