/*
 * Copyright (c) 2005 Topspin Communications.  All rights reserved.
 * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
 * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <config.h>

#include <stdlib.h>
#include <pthread.h>
#include <string.h>
#include <errno.h>

#include "mlx4.h"
#include "doorbell.h"
#include "wqe.h"

static const uint32_t mlx4_ib_opcode[] = {
	[IBV_WR_SEND]			= MLX4_OPCODE_SEND,
	[IBV_WR_SEND_WITH_IMM]		= MLX4_OPCODE_SEND_IMM,
	[IBV_WR_RDMA_WRITE]		= MLX4_OPCODE_RDMA_WRITE,
	[IBV_WR_RDMA_WRITE_WITH_IMM]	= MLX4_OPCODE_RDMA_WRITE_IMM,
	[IBV_WR_RDMA_READ]		= MLX4_OPCODE_RDMA_READ,
	[IBV_WR_ATOMIC_CMP_AND_SWP]	= MLX4_OPCODE_ATOMIC_CS,
	[IBV_WR_ATOMIC_FETCH_AND_ADD]	= MLX4_OPCODE_ATOMIC_FA,
	[IBV_WR_LOCAL_INV]		= MLX4_OPCODE_LOCAL_INVAL,
	[IBV_WR_BIND_MW]		= MLX4_OPCODE_BIND_MW,
	[IBV_WR_SEND_WITH_INV]		= MLX4_OPCODE_SEND_INVAL,
};

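/*
 * WQE n of a work queue lives (n << wqe_shift) bytes past that queue's
 * offset within the QP buffer, i.e. each queue has a fixed stride of
 * 1 << wqe_shift bytes per WQE.
 */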
static void *get_recv_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
}

static void *get_send_wqe(struct mlx4_qp *qp, int n)
{
	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
}

/*
 * Stamp a SQ WQE so that it is invalid if prefetched by marking the
 * first four bytes of every 64 byte chunk with 0xffffffff, except for
 * the very first chunk of the WQE.
 */
static void stamp_send_wqe(struct mlx4_qp *qp, int n)
{
	uint32_t *wqe = get_send_wqe(qp, n);
	int i;
	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;

	for (i = 16; i < ds; i += 16)
		wqe[i] = 0xffffffff;
}

void mlx4_init_qp_indices(struct mlx4_qp *qp)
{
	qp->sq.head = 0;
	qp->sq.tail = 0;
	qp->rq.head = 0;
	qp->rq.tail = 0;
}

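/*
 * Mark every send WQE as hardware-owned (bit 31 of owner_opcode),
 * record its full size in 16-byte units in fence_size, and stamp it,
 * so that nothing in the freshly allocated ring can be mistaken for a
 * valid, software-posted descriptor.
 */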
void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
{
	struct mlx4_wqe_ctrl_seg *ctrl;
	int i;

	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
		ctrl = get_send_wqe(qp, i);
		ctrl->owner_opcode = htobe32(1 << 31);
		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);

		stamp_send_wqe(qp, i);
	}
}

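/*
 * Check whether posting nreq more requests would overflow the work
 * queue.  The first head - tail check is done without a lock; only if
 * the queue looks full do we take the CQ lock and re-read, since the
 * tail is advanced by the CQ polling path under that lock.
 */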
static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
{
	unsigned cur;

	cur = wq->head - wq->tail;
	if (cur + nreq < wq->max_post)
		return 0;

	pthread_spin_lock(&cq->lock);
	cur = wq->head - wq->tail;
	pthread_spin_unlock(&cq->lock);

	return cur + nreq >= wq->max_post;
}

static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr)
{
	int acc = wr->bind_mw.bind_info.mw_access_flags;

	bseg->flags1 = 0;
	if (acc & IBV_ACCESS_REMOTE_ATOMIC)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC);
	if (acc & IBV_ACCESS_REMOTE_WRITE)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE);
	if (acc & IBV_ACCESS_REMOTE_READ)
		bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ);

	bseg->flags2 = 0;
	if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
		bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2);
	if (acc & IBV_ACCESS_ZERO_BASED)
		bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED);

	bseg->new_rkey = htobe32(wr->bind_mw.rkey);
	bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey);
	bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
	bseg->length = htobe64(wr->bind_mw.bind_info.length);
}

static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
				     uint32_t rkey)
{
	iseg->mem_key = htobe32(rkey);

	iseg->reserved1 = 0;
	iseg->reserved2 = 0;
	iseg->reserved3[0] = 0;
	iseg->reserved3[1] = 0;
}

static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
				 uint64_t remote_addr, uint32_t rkey)
{
	rseg->raddr    = htobe64(remote_addr);
	rseg->rkey     = htobe32(rkey);
	rseg->reserved = 0;
}

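/*
 * For compare-and-swap the atomic segment carries both the swap value
 * and the compare value; for fetch-and-add only swap_add is used,
 * holding the value to add.
 */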
static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
{
	if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
		aseg->swap_add = htobe64(wr->wr.atomic.swap);
		aseg->compare  = htobe64(wr->wr.atomic.compare_add);
	} else {
		aseg->swap_add = htobe64(wr->wr.atomic.compare_add);
		aseg->compare  = 0;
	}
}

static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
			     struct ibv_send_wr *wr)
{
	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
	dseg->dqpn = htobe32(wr->wr.ud.remote_qpn);
	dseg->qkey = htobe32(wr->wr.ud.remote_qkey);
	dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan);
	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
}

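/*
 * __set_data_seg() writes a scatter entry in natural order and is used
 * on the receive path; set_data_seg() below is used on the send path
 * and defers byte_count behind a barrier (see its comment).
 */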
static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->byte_count = htobe32(sg->length);
	dseg->lkey       = htobe32(sg->lkey);
	dseg->addr       = htobe64(sg->addr);
}

static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
{
	dseg->lkey = htobe32(sg->lkey);
	dseg->addr = htobe64(sg->addr);

	/*
	 * Need a barrier here before writing the byte_count field to
	 * make sure that all the data is visible before the
	 * byte_count field is set.  Otherwise, if the segment begins
	 * a new cacheline, the HCA prefetcher could grab the 64-byte
	 * chunk and get a valid (!= 0xffffffff) byte count but stale
	 * data, and end up sending the wrong data.
	 */
	udma_to_device_barrier();

	if (likely(sg->length))
		dseg->byte_count = htobe32(sg->length);
	else
		dseg->byte_count = htobe32(0x80000000);
}

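/*
 * Post a chain of send work requests: for each request, build the
 * control segment, any transport-specific segments (remote address,
 * atomic, datagram, bind, local invalidate) and the data segments
 * (inline or gather), then hand the WQE to the hardware by writing
 * its ownership/opcode word.  The doorbell (or BlueFlame copy) is
 * issued once, after the whole chain has been written.
 */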
int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
		   struct ibv_send_wr **bad_wr)
{
	struct mlx4_context *ctx;
	struct mlx4_qp *qp = to_mqp(ibqp);
	void *wqe;
	struct mlx4_wqe_ctrl_seg *ctrl = NULL;
	int ind;
	int nreq;
	int inl = 0;
	int ret = 0;
	int size = 0;
	int i;

	pthread_spin_lock(&qp->sq.lock);

	/* XXX check that state is OK to post send */

	ind = qp->sq.head;

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->sq.max_gs) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
			ret = EINVAL;
			*bad_wr = wr;
			goto out;
		}

		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;

		ctrl->srcrb_flags =
			(wr->send_flags & IBV_SEND_SIGNALED ?
			 htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
			(wr->send_flags & IBV_SEND_SOLICITED ?
			 htobe32(MLX4_WQE_CTRL_SOLICIT) : 0) |
			qp->sq_signal_bits;

		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
			ctrl->imm = wr->imm_data;
		else
			ctrl->imm = 0;

		wqe += sizeof *ctrl;
		size = sizeof *ctrl / 16;

		switch (ibqp->qp_type) {
		case IBV_QPT_XRC_SEND:
			ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
			/* fall through */
		case IBV_QPT_RC:
		case IBV_QPT_UC:
			switch (wr->opcode) {
			case IBV_WR_ATOMIC_CMP_AND_SWP:
			case IBV_WR_ATOMIC_FETCH_AND_ADD:
				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
					      wr->wr.atomic.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);

				set_atomic_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
				size += (sizeof (struct mlx4_wqe_raddr_seg) +
					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;

				break;

			case IBV_WR_RDMA_READ:
				inl = 1;
				/* fall through */
			case IBV_WR_RDMA_WRITE:
			case IBV_WR_RDMA_WRITE_WITH_IMM:
				if (!wr->num_sge)
					inl = 1;
				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
					      wr->wr.rdma.rkey);
				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;

				break;
			case IBV_WR_LOCAL_INV:
				ctrl->srcrb_flags |=
					htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
				set_local_inv_seg(wqe, wr->imm_data);
				wqe  += sizeof (struct mlx4_wqe_local_inval_seg);
				size += sizeof (struct mlx4_wqe_local_inval_seg) / 16;
				break;
			case IBV_WR_BIND_MW:
				ctrl->srcrb_flags |=
					htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
				set_bind_seg(wqe, wr);
				wqe  += sizeof (struct mlx4_wqe_bind_seg);
				size += sizeof (struct mlx4_wqe_bind_seg) / 16;
				break;
			case IBV_WR_SEND_WITH_INV:
				ctrl->imm = htobe32(wr->imm_data);
				break;

			default:
				/* No extra segments required for sends */
				break;
			}
			break;

		case IBV_QPT_UD:
			set_datagram_seg(wqe, wr);
			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;

			if (wr->send_flags & IBV_SEND_IP_CSUM) {
				if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) {
					ret = EINVAL;
					*bad_wr = wr;
					goto out;
				}
				ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
							     MLX4_WQE_CTRL_TCP_UDP_CSUM);
			}
			break;

		case IBV_QPT_RAW_PACKET:
			/* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
			 * to indicate that no icrc should be calculated */
			ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT);
			if (wr->send_flags & IBV_SEND_IP_CSUM) {
				if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) {
					ret = EINVAL;
					*bad_wr = wr;
					goto out;
				}
				ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
							     MLX4_WQE_CTRL_TCP_UDP_CSUM);
			}
			break;

		default:
			break;
		}

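		/*
		 * Inline sends copy the payload directly into the WQE,
		 * chopped into multiple inline segments so that no
		 * segment crosses a MLX4_INLINE_ALIGN (64 byte) boundary.
		 * Otherwise a gather list of data segments is written,
		 * last entry first.
		 */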
		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
			struct mlx4_wqe_inline_seg *seg;
			void *addr;
			int len, seg_len;
			int num_seg;
			int off, to_copy;

			inl = 0;

			seg = wqe;
			wqe += sizeof *seg;
			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
			num_seg = 0;
			seg_len = 0;

			for (i = 0; i < wr->num_sge; ++i) {
				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
				len  = wr->sg_list[i].length;
				inl += len;

				if (inl > qp->max_inline_data) {
					inl = 0;
					ret = ENOMEM;
					*bad_wr = wr;
					goto out;
				}

				while (len >= MLX4_INLINE_ALIGN - off) {
					to_copy = MLX4_INLINE_ALIGN - off;
					memcpy(wqe, addr, to_copy);
					len -= to_copy;
					wqe += to_copy;
					addr += to_copy;
					seg_len += to_copy;
					udma_to_device_barrier(); /* see comment below */
					seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
					seg_len = 0;
					seg = wqe;
					wqe += sizeof *seg;
					off = sizeof *seg;
					++num_seg;
				}

				memcpy(wqe, addr, len);
				wqe += len;
				seg_len += len;
				off += len;
			}

			if (seg_len) {
				++num_seg;
				/*
				 * Need a barrier here to make sure
				 * all the data is visible before the
				 * byte_count field is set.  Otherwise
				 * the HCA prefetcher could grab the
				 * 64-byte chunk with this inline
				 * segment and get a valid (!=
				 * 0xffffffff) byte count but stale
				 * data, and end up sending the wrong
				 * data.
				 */
				udma_to_device_barrier();
				seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
			}

			size += (inl + num_seg * sizeof *seg + 15) / 16;
		} else {
			struct mlx4_wqe_data_seg *seg = wqe;

			for (i = wr->num_sge - 1; i >= 0 ; --i)
				set_data_seg(seg + i, wr->sg_list + i);

			size += wr->num_sge * (sizeof *seg / 16);
		}

		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
				    MLX4_WQE_CTRL_FENCE : 0) | size;

		/*
		 * Make sure descriptor is fully written before
		 * setting ownership bit (because HW can start
		 * executing as soon as we do).
		 */
		udma_to_device_barrier();

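		/*
		 * The ownership bit flips every time the producer index
		 * wraps around the ring (ind & qp->sq.wqe_cnt), which is
		 * how the HCA tells a freshly posted WQE from one left
		 * over from the previous pass.
		 */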
		ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
			(ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0);

		/*
		 * We can improve latency by not stamping the last
		 * send queue WQE until after ringing the doorbell, so
		 * only stamp here if there are still more WQEs to post.
		 */
		if (wr->next)
			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
				       (qp->sq.wqe_cnt - 1));

		++ind;
	}

out:
	ctx = to_mctx(ibqp->context);

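	/*
	 * If exactly one request was posted and its descriptor is small
	 * enough for the BlueFlame buffer, copy the WQE straight to the
	 * BlueFlame page through the write-combining mapping instead of
	 * ringing the doorbell; otherwise ring the send doorbell in the
	 * UAR once for the whole chain.
	 */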
	if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) {
		ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);

		ctrl->bf_qpn |= qp->doorbell_qpn;
		++qp->sq.head;
		/*
		 * Make sure that descriptor is written to memory
		 * before writing to BlueFlame page.
		 */
		mmio_wc_spinlock(&ctx->bf_lock);

		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
			     align(size * 16, 64));
		/* Flush before toggling bf_offset to be latency oriented */
		mmio_flush_writes();

		ctx->bf_offset ^= ctx->bf_buf_size;

		pthread_spin_unlock(&ctx->bf_lock);
	} else if (nreq) {
		qp->sq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		udma_to_device_barrier();

		mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
			    qp->doorbell_qpn);
	}

	if (nreq)
		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
			       (qp->sq.wqe_cnt - 1));

	pthread_spin_unlock(&qp->sq.lock);

	return ret;
}

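/*
 * Post a chain of receive work requests.  Each receive WQE is simply a
 * list of scatter entries; if fewer than max_gs entries are used, the
 * list is terminated with a zero-length entry carrying the invalid
 * lkey.  The doorbell record is updated once, after all requests in
 * the chain have been written.
 */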
int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
		   struct ibv_recv_wr **bad_wr)
{
	struct mlx4_qp *qp = to_mqp(ibqp);
	struct mlx4_wqe_data_seg *scat;
	int ret = 0;
	int nreq;
	int ind;
	int i;

	pthread_spin_lock(&qp->rq.lock);

	/* XXX check that state is OK to post receive */

	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);

	for (nreq = 0; wr; ++nreq, wr = wr->next) {
		if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		if (wr->num_sge > qp->rq.max_gs) {
			ret = ENOMEM;
			*bad_wr = wr;
			goto out;
		}

		scat = get_recv_wqe(qp, ind);

		for (i = 0; i < wr->num_sge; ++i)
			__set_data_seg(scat + i, wr->sg_list + i);

		if (i < qp->rq.max_gs) {
			scat[i].byte_count = 0;
			scat[i].lkey       = htobe32(MLX4_INVALID_LKEY);
			scat[i].addr       = 0;
		}

		qp->rq.wrid[ind] = wr->wr_id;

		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
	}

out:
	if (nreq) {
		qp->rq.head += nreq;

		/*
		 * Make sure that descriptors are written before
		 * doorbell record.
		 */
		udma_to_device_barrier();

		*qp->db = htobe32(qp->rq.head & 0xffff);
	}

	pthread_spin_unlock(&qp->rq.lock);

	return ret;
}

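/*
 * Worst-case number of inline segments needed to carry "data" bytes of
 * inline payload, given that no inline segment may cross a 64 byte
 * boundary (see the comment in the function body).  As a rough worked
 * example, assuming the usual 4 byte inline segment header: an RC QP
 * posting 128 bytes inline has only 32 bytes left in the first chunk
 * after the control and remote address segments, so the data splits
 * into 28 + 60 + 40 bytes, i.e. three inline segments.
 */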
static int num_inline_segs(int data, enum ibv_qp_type type)
{
	/*
	 * Inline data segments are not allowed to cross 64 byte
	 * boundaries.  For UD QPs, the data segments always start
	 * aligned to 64 bytes (16 byte control segment + 48 byte
	 * datagram segment); for other QPs, there will be a 16 byte
	 * control segment and possibly a 16 byte remote address
	 * segment, so in the worst case there will be only 32 bytes
	 * available for the first data segment.
	 */
	if (type == IBV_QPT_UD)
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_datagram_seg)) %
			MLX4_INLINE_ALIGN;
	else
		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
			 sizeof (struct mlx4_wqe_raddr_seg)) %
			MLX4_INLINE_ALIGN;

	return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
		(MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
}

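/*
 * Compute the send WQE stride for this QP: the largest segment layout
 * the QP type can generate (worst-case inline or gather list, atomic,
 * or bind request), rounded up to the next power of two with a
 * minimum of 64 bytes (wqe_shift starts at 6).
 */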
void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
			   struct mlx4_qp *qp)
{
	int size;
	int max_sq_sge;

	max_sq_sge = align(cap->max_inline_data +
			   num_inline_segs(cap->max_inline_data, type) *
			   sizeof (struct mlx4_wqe_inline_seg),
			   sizeof (struct mlx4_wqe_data_seg)) /
		sizeof (struct mlx4_wqe_data_seg);
	if (max_sq_sge < cap->max_send_sge)
		max_sq_sge = cap->max_send_sge;

	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
	switch (type) {
	case IBV_QPT_UD:
		size += sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_UC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		break;

	case IBV_QPT_XRC_SEND:
	case IBV_QPT_RC:
		size += sizeof (struct mlx4_wqe_raddr_seg);
		/*
		 * An atomic op will require an atomic segment, a
		 * remote address segment and one scatter entry.
		 */
		if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
			    sizeof (struct mlx4_wqe_raddr_seg) +
			    sizeof (struct mlx4_wqe_data_seg)))
			size = (sizeof (struct mlx4_wqe_atomic_seg) +
				sizeof (struct mlx4_wqe_raddr_seg) +
				sizeof (struct mlx4_wqe_data_seg));
		break;

	default:
		break;
	}

	/* Make sure that we have enough space for a bind request */
	if (size < sizeof (struct mlx4_wqe_bind_seg))
		size = sizeof (struct mlx4_wqe_bind_seg);

	size += sizeof (struct mlx4_wqe_ctrl_seg);

	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
	     qp->sq.wqe_shift++)
		; /* nothing */
}

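/*
 * Allocate the wrid bookkeeping arrays and the single buffer that
 * holds both work queues.  The queue with the larger WQE stride is
 * placed first, so each ring's offset stays a multiple of its own
 * stride.
 */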
int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
		      enum ibv_qp_type type, struct mlx4_qp *qp)
{
	qp->rq.max_gs = cap->max_recv_sge;

	if (qp->sq.wqe_cnt) {
		qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
		if (!qp->sq.wrid)
			return -1;
	}

	if (qp->rq.wqe_cnt) {
		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
		if (!qp->rq.wrid) {
			free(qp->sq.wrid);
			return -1;
		}
	}

	for (qp->rq.wqe_shift = 4;
	     1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
	     qp->rq.wqe_shift++)
		; /* nothing */

	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
		qp->rq.offset = 0;
		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
	} else {
		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
		qp->sq.offset = 0;
	}

	if (qp->buf_size) {
		if (mlx4_alloc_buf(&qp->buf,
				   align(qp->buf_size, to_mdev(context->device)->page_size),
				   to_mdev(context->device)->page_size)) {
			free(qp->sq.wrid);
			free(qp->rq.wrid);
			return -1;
		}

		memset(qp->buf.buf, 0, qp->buf_size);
	} else {
		qp->buf.buf = NULL;
	}

	return 0;
}

void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
		       enum ibv_qp_type type)
{
	int wqe_size;

	wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg);
	switch (type) {
	case IBV_QPT_UD:
		wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
		break;

	case IBV_QPT_XRC_SEND:
	case IBV_QPT_UC:
	case IBV_QPT_RC:
		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
		break;

	default:
		break;
	}

	qp->sq.max_gs     = wqe_size / sizeof (struct mlx4_wqe_data_seg);
	cap->max_send_sge = qp->sq.max_gs;
	qp->sq.max_post   = qp->sq.wqe_cnt - qp->sq_spare_wqes;
	cap->max_send_wr  = qp->sq.max_post;

	/*
	 * Inline data segments can't cross a 64 byte boundary.  So
	 * subtract off one segment header for each 64-byte chunk,
	 * taking into account the fact that wqe_size will be 32 mod
	 * 64 for non-UD QPs.
	 */
	qp->max_inline_data = wqe_size -
		sizeof (struct mlx4_wqe_inline_seg) *
		(align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
	cap->max_inline_data = qp->max_inline_data;
}

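/*
 * The context keeps a two-level table of QPs so that a QP can be
 * looked up by number (e.g. when processing completions): the low
 * bits of the QPN select a second-level table, allocated on demand
 * and reference counted, which is then indexed by qpn & qp_table_mask.
 */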
struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (ctx->qp_table[tind].refcnt)
		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
	else
		return NULL;
}

int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!ctx->qp_table[tind].refcnt) {
		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
						   sizeof (struct mlx4_qp *));
		if (!ctx->qp_table[tind].table)
			return -1;
	}

	++ctx->qp_table[tind].refcnt;
	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
	return 0;
}

void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
{
	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;

	if (!--ctx->qp_table[tind].refcnt)
		free(ctx->qp_table[tind].table);
	else
		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
}