xref: /freebsd/contrib/ofed/libmlx4/qp.c (revision bdafb02fcb88389fd1ab684cfe734cb429d35618)
1 /*
2  * Copyright (c) 2005 Topspin Communications.  All rights reserved.
3  * Copyright (c) 2005 Mellanox Technologies Ltd.  All rights reserved.
4  * Copyright (c) 2007 Cisco, Inc.  All rights reserved.
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenIB.org BSD license below:
11  *
12  *     Redistribution and use in source and binary forms, with or
13  *     without modification, are permitted provided that the following
14  *     conditions are met:
15  *
16  *      - Redistributions of source code must retain the above
17  *        copyright notice, this list of conditions and the following
18  *        disclaimer.
19  *
20  *      - Redistributions in binary form must reproduce the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer in the documentation and/or other materials
23  *        provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  */
34 
35 #include <config.h>
36 
37 #include <stdlib.h>
38 #include <pthread.h>
39 #include <string.h>
40 #include <errno.h>
41 
42 #include "mlx4.h"
43 #include "doorbell.h"
44 #include "wqe.h"
45 
46 static const uint32_t mlx4_ib_opcode[] = {
47 	[IBV_WR_SEND]			= MLX4_OPCODE_SEND,
48 	[IBV_WR_SEND_WITH_IMM]		= MLX4_OPCODE_SEND_IMM,
49 	[IBV_WR_RDMA_WRITE]		= MLX4_OPCODE_RDMA_WRITE,
50 	[IBV_WR_RDMA_WRITE_WITH_IMM]	= MLX4_OPCODE_RDMA_WRITE_IMM,
51 	[IBV_WR_RDMA_READ]		= MLX4_OPCODE_RDMA_READ,
52 	[IBV_WR_ATOMIC_CMP_AND_SWP]	= MLX4_OPCODE_ATOMIC_CS,
53 	[IBV_WR_ATOMIC_FETCH_AND_ADD]	= MLX4_OPCODE_ATOMIC_FA,
54 	[IBV_WR_LOCAL_INV]		= MLX4_OPCODE_LOCAL_INVAL,
55 	[IBV_WR_BIND_MW]		= MLX4_OPCODE_BIND_MW,
56 	[IBV_WR_SEND_WITH_INV]		= MLX4_OPCODE_SEND_INVAL,
57 };
58 
59 static void *get_recv_wqe(struct mlx4_qp *qp, int n)
60 {
61 	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
62 }
63 
64 static void *get_send_wqe(struct mlx4_qp *qp, int n)
65 {
66 	return qp->buf.buf + qp->sq.offset + (n << qp->sq.wqe_shift);
67 }
68 
69 /*
70  * Stamp a SQ WQE so that it is invalid if prefetched by marking the
71  * first four bytes of every 64 byte chunk with 0xffffffff, except for
72  * the very first chunk of the WQE.
73  */
74 static void stamp_send_wqe(struct mlx4_qp *qp, int n)
75 {
76 	uint32_t *wqe = get_send_wqe(qp, n);
77 	int i;
78 	int ds = (((struct mlx4_wqe_ctrl_seg *)wqe)->fence_size & 0x3f) << 2;
79 
80 	for (i = 16; i < ds; i += 16)
81 		wqe[i] = 0xffffffff;
82 }
83 
84 void mlx4_init_qp_indices(struct mlx4_qp *qp)
85 {
86 	qp->sq.head	 = 0;
87 	qp->sq.tail	 = 0;
88 	qp->rq.head	 = 0;
89 	qp->rq.tail	 = 0;
90 }
91 
92 void mlx4_qp_init_sq_ownership(struct mlx4_qp *qp)
93 {
94 	struct mlx4_wqe_ctrl_seg *ctrl;
95 	int i;
96 
97 	for (i = 0; i < qp->sq.wqe_cnt; ++i) {
98 		ctrl = get_send_wqe(qp, i);
99 		ctrl->owner_opcode = htobe32(1 << 31);
100 		ctrl->fence_size = 1 << (qp->sq.wqe_shift - 4);
101 
102 		stamp_send_wqe(qp, i);
103 	}
104 }
105 
106 static int wq_overflow(struct mlx4_wq *wq, int nreq, struct mlx4_cq *cq)
107 {
108 	unsigned cur;
109 
110 	cur = wq->head - wq->tail;
111 	if (cur + nreq < wq->max_post)
112 		return 0;
113 
114 	pthread_spin_lock(&cq->lock);
115 	cur = wq->head - wq->tail;
116 	pthread_spin_unlock(&cq->lock);
117 
118 	return cur + nreq >= wq->max_post;
119 }
120 
121 static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, struct ibv_send_wr *wr)
122 {
123 	int acc = wr->bind_mw.bind_info.mw_access_flags;
124 	bseg->flags1 = 0;
125 	if (acc & IBV_ACCESS_REMOTE_ATOMIC)
126 		bseg->flags1 |= htobe32(MLX4_WQE_MW_ATOMIC);
127 	if (acc & IBV_ACCESS_REMOTE_WRITE)
128 		bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_WRITE);
129 	if (acc & IBV_ACCESS_REMOTE_READ)
130 		bseg->flags1 |= htobe32(MLX4_WQE_MW_REMOTE_READ);
131 
132 	bseg->flags2 = 0;
133 	if (((struct ibv_mw *)(wr->bind_mw.mw))->type == IBV_MW_TYPE_2)
134 		bseg->flags2 |= htobe32(MLX4_WQE_BIND_TYPE_2);
135 	if (acc & IBV_ACCESS_ZERO_BASED)
136 		bseg->flags2 |= htobe32(MLX4_WQE_BIND_ZERO_BASED);
137 
138 	bseg->new_rkey = htobe32(wr->bind_mw.rkey);
139 	bseg->lkey = htobe32(wr->bind_mw.bind_info.mr->lkey);
140 	bseg->addr = htobe64((uint64_t) wr->bind_mw.bind_info.addr);
141 	bseg->length = htobe64(wr->bind_mw.bind_info.length);
142 }
143 
144 static inline void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg,
145 		uint32_t rkey)
146 {
147 	iseg->mem_key	= htobe32(rkey);
148 
149 	iseg->reserved1    = 0;
150 	iseg->reserved2    = 0;
151 	iseg->reserved3[0] = 0;
152 	iseg->reserved3[1] = 0;
153 }
154 
155 static inline void set_raddr_seg(struct mlx4_wqe_raddr_seg *rseg,
156 				 uint64_t remote_addr, uint32_t rkey)
157 {
158 	rseg->raddr    = htobe64(remote_addr);
159 	rseg->rkey     = htobe32(rkey);
160 	rseg->reserved = 0;
161 }
162 
163 static void set_atomic_seg(struct mlx4_wqe_atomic_seg *aseg, struct ibv_send_wr *wr)
164 {
165 	if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
166 		aseg->swap_add = htobe64(wr->wr.atomic.swap);
167 		aseg->compare  = htobe64(wr->wr.atomic.compare_add);
168 	} else {
169 		aseg->swap_add = htobe64(wr->wr.atomic.compare_add);
170 		aseg->compare  = 0;
171 	}
172 
173 }
174 
175 static void set_datagram_seg(struct mlx4_wqe_datagram_seg *dseg,
176 			     struct ibv_send_wr *wr)
177 {
178 	memcpy(dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof (struct mlx4_av));
179 	dseg->dqpn = htobe32(wr->wr.ud.remote_qpn);
180 	dseg->qkey = htobe32(wr->wr.ud.remote_qkey);
181 	dseg->vlan = htobe16(to_mah(wr->wr.ud.ah)->vlan);
182 	memcpy(dseg->mac, to_mah(wr->wr.ud.ah)->mac, 6);
183 }
184 
185 static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
186 {
187 	dseg->byte_count = htobe32(sg->length);
188 	dseg->lkey       = htobe32(sg->lkey);
189 	dseg->addr       = htobe64(sg->addr);
190 }
191 
192 static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ibv_sge *sg)
193 {
194 	dseg->lkey       = htobe32(sg->lkey);
195 	dseg->addr       = htobe64(sg->addr);
196 
197 	/*
198 	 * Need a barrier here before writing the byte_count field to
199 	 * make sure that all the data is visible before the
200 	 * byte_count field is set.  Otherwise, if the segment begins
201 	 * a new cacheline, the HCA prefetcher could grab the 64-byte
202 	 * chunk and get a valid (!= * 0xffffffff) byte count but
203 	 * stale data, and end up sending the wrong data.
204 	 */
205 	udma_to_device_barrier();
206 
207 	if (likely(sg->length))
208 		dseg->byte_count = htobe32(sg->length);
209 	else
210 		dseg->byte_count = htobe32(0x80000000);
211 }
212 
213 int mlx4_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
214 			  struct ibv_send_wr **bad_wr)
215 {
216 	struct mlx4_context *ctx;
217 	struct mlx4_qp *qp = to_mqp(ibqp);
218 	void *wqe;
219 	struct mlx4_wqe_ctrl_seg *ctrl = NULL;
220 	int ind;
221 	int nreq;
222 	int inl = 0;
223 	int ret = 0;
224 	int size = 0;
225 	int i;
226 
227 	pthread_spin_lock(&qp->sq.lock);
228 
229 	/* XXX check that state is OK to post send */
230 
231 	ind = qp->sq.head;
232 
233 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
234 		if (wq_overflow(&qp->sq, nreq, to_mcq(ibqp->send_cq))) {
235 			ret = ENOMEM;
236 			*bad_wr = wr;
237 			goto out;
238 		}
239 
240 		if (wr->num_sge > qp->sq.max_gs) {
241 			ret = ENOMEM;
242 			*bad_wr = wr;
243 			goto out;
244 		}
245 
246 		if (wr->opcode >= sizeof mlx4_ib_opcode / sizeof mlx4_ib_opcode[0]) {
247 			ret = EINVAL;
248 			*bad_wr = wr;
249 			goto out;
250 		}
251 
252 		ctrl = wqe = get_send_wqe(qp, ind & (qp->sq.wqe_cnt - 1));
253 		qp->sq.wrid[ind & (qp->sq.wqe_cnt - 1)] = wr->wr_id;
254 
255 		ctrl->srcrb_flags =
256 			(wr->send_flags & IBV_SEND_SIGNALED ?
257 			 htobe32(MLX4_WQE_CTRL_CQ_UPDATE) : 0) |
258 			(wr->send_flags & IBV_SEND_SOLICITED ?
259 			 htobe32(MLX4_WQE_CTRL_SOLICIT) : 0)   |
260 			qp->sq_signal_bits;
261 
262 		if (wr->opcode == IBV_WR_SEND_WITH_IMM ||
263 		    wr->opcode == IBV_WR_RDMA_WRITE_WITH_IMM)
264 			ctrl->imm = wr->imm_data;
265 		else
266 			ctrl->imm = 0;
267 
268 		wqe += sizeof *ctrl;
269 		size = sizeof *ctrl / 16;
270 
271 		switch (ibqp->qp_type) {
272 		case IBV_QPT_XRC_SEND:
273 			ctrl->srcrb_flags |= MLX4_REMOTE_SRQN_FLAGS(wr);
274 			/* fall through */
275 		case IBV_QPT_RC:
276 		case IBV_QPT_UC:
277 			switch (wr->opcode) {
278 			case IBV_WR_ATOMIC_CMP_AND_SWP:
279 			case IBV_WR_ATOMIC_FETCH_AND_ADD:
280 				set_raddr_seg(wqe, wr->wr.atomic.remote_addr,
281 					      wr->wr.atomic.rkey);
282 				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
283 
284 				set_atomic_seg(wqe, wr);
285 				wqe  += sizeof (struct mlx4_wqe_atomic_seg);
286 				size += (sizeof (struct mlx4_wqe_raddr_seg) +
287 					 sizeof (struct mlx4_wqe_atomic_seg)) / 16;
288 
289 				break;
290 
291 			case IBV_WR_RDMA_READ:
292 				inl = 1;
293 				/* fall through */
294 			case IBV_WR_RDMA_WRITE:
295 			case IBV_WR_RDMA_WRITE_WITH_IMM:
296 				if (!wr->num_sge)
297 					inl = 1;
298 				set_raddr_seg(wqe, wr->wr.rdma.remote_addr,
299 					      wr->wr.rdma.rkey);
300 				wqe  += sizeof (struct mlx4_wqe_raddr_seg);
301 				size += sizeof (struct mlx4_wqe_raddr_seg) / 16;
302 
303 				break;
304 			case IBV_WR_LOCAL_INV:
305 				ctrl->srcrb_flags |=
306 					htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
307 				set_local_inv_seg(wqe, wr->imm_data);
308 				wqe  += sizeof
309 					(struct mlx4_wqe_local_inval_seg);
310 				size += sizeof
311 					(struct mlx4_wqe_local_inval_seg) / 16;
312 				break;
313 			case IBV_WR_BIND_MW:
314 				ctrl->srcrb_flags |=
315 					htobe32(MLX4_WQE_CTRL_STRONG_ORDER);
316 				set_bind_seg(wqe, wr);
317 				wqe  += sizeof
318 					(struct mlx4_wqe_bind_seg);
319 				size += sizeof
320 					(struct mlx4_wqe_bind_seg) / 16;
321 				break;
322 			case IBV_WR_SEND_WITH_INV:
323 				ctrl->imm = htobe32(wr->imm_data);
324 				break;
325 
326 			default:
327 				/* No extra segments required for sends */
328 				break;
329 			}
330 			break;
331 
332 		case IBV_QPT_UD:
333 			set_datagram_seg(wqe, wr);
334 			wqe  += sizeof (struct mlx4_wqe_datagram_seg);
335 			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
336 
337 			if (wr->send_flags & IBV_SEND_IP_CSUM) {
338 				if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_UD_OVER_IB)) {
339 					ret = EINVAL;
340 					*bad_wr = wr;
341 					goto out;
342 				}
343 				ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
344 							   MLX4_WQE_CTRL_TCP_UDP_CSUM);
345 			}
346 			break;
347 
348 		case IBV_QPT_RAW_PACKET:
349 			/* For raw eth, the MLX4_WQE_CTRL_SOLICIT flag is used
350 			 * to indicate that no icrc should be calculated */
351 			ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_SOLICIT);
352 			if (wr->send_flags & IBV_SEND_IP_CSUM) {
353 				if (!(qp->qp_cap_cache & MLX4_CSUM_SUPPORT_RAW_OVER_ETH)) {
354 					ret = EINVAL;
355 					*bad_wr = wr;
356 					goto out;
357 				}
358 				ctrl->srcrb_flags |= htobe32(MLX4_WQE_CTRL_IP_HDR_CSUM |
359 							   MLX4_WQE_CTRL_TCP_UDP_CSUM);
360 			}
361 			break;
362 
363 		default:
364 			break;
365 		}
366 
367 		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
368 			struct mlx4_wqe_inline_seg *seg;
369 			void *addr;
370 			int len, seg_len;
371 			int num_seg;
372 			int off, to_copy;
373 
374 			inl = 0;
375 
376 			seg = wqe;
377 			wqe += sizeof *seg;
378 			off = ((uintptr_t) wqe) & (MLX4_INLINE_ALIGN - 1);
379 			num_seg = 0;
380 			seg_len = 0;
381 
382 			for (i = 0; i < wr->num_sge; ++i) {
383 				addr = (void *) (uintptr_t) wr->sg_list[i].addr;
384 				len  = wr->sg_list[i].length;
385 				inl += len;
386 
387 				if (inl > qp->max_inline_data) {
388 					inl = 0;
389 					ret = ENOMEM;
390 					*bad_wr = wr;
391 					goto out;
392 				}
393 
394 				while (len >= MLX4_INLINE_ALIGN - off) {
395 					to_copy = MLX4_INLINE_ALIGN - off;
396 					memcpy(wqe, addr, to_copy);
397 					len -= to_copy;
398 					wqe += to_copy;
399 					addr += to_copy;
400 					seg_len += to_copy;
401 					udma_to_device_barrier(); /* see comment below */
402 					seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
403 					seg_len = 0;
404 					seg = wqe;
405 					wqe += sizeof *seg;
406 					off = sizeof *seg;
407 					++num_seg;
408 				}
409 
410 				memcpy(wqe, addr, len);
411 				wqe += len;
412 				seg_len += len;
413 				off += len;
414 			}
415 
416 			if (seg_len) {
417 				++num_seg;
418 				/*
419 				 * Need a barrier here to make sure
420 				 * all the data is visible before the
421 				 * byte_count field is set.  Otherwise
422 				 * the HCA prefetcher could grab the
423 				 * 64-byte chunk with this inline
424 				 * segment and get a valid (!=
425 				 * 0xffffffff) byte count but stale
426 				 * data, and end up sending the wrong
427 				 * data.
428 				 */
429 				udma_to_device_barrier();
430 				seg->byte_count = htobe32(MLX4_INLINE_SEG | seg_len);
431 			}
432 
433 			size += (inl + num_seg * sizeof * seg + 15) / 16;
434 		} else {
435 			struct mlx4_wqe_data_seg *seg = wqe;
436 
437 			for (i = wr->num_sge - 1; i >= 0 ; --i)
438 				set_data_seg(seg + i, wr->sg_list + i);
439 
440 			size += wr->num_sge * (sizeof *seg / 16);
441 		}
442 
443 		ctrl->fence_size = (wr->send_flags & IBV_SEND_FENCE ?
444 				    MLX4_WQE_CTRL_FENCE : 0) | size;
445 
446 		/*
447 		 * Make sure descriptor is fully written before
448 		 * setting ownership bit (because HW can start
449 		 * executing as soon as we do).
450 		 */
451 		udma_to_device_barrier();
452 
453 		ctrl->owner_opcode = htobe32(mlx4_ib_opcode[wr->opcode]) |
454 			(ind & qp->sq.wqe_cnt ? htobe32(1 << 31) : 0);
455 
456 		/*
457 		 * We can improve latency by not stamping the last
458 		 * send queue WQE until after ringing the doorbell, so
459 		 * only stamp here if there are still more WQEs to post.
460 		 */
461 		if (wr->next)
462 			stamp_send_wqe(qp, (ind + qp->sq_spare_wqes) &
463 				       (qp->sq.wqe_cnt - 1));
464 
465 		++ind;
466 	}
467 
468 out:
469 	ctx = to_mctx(ibqp->context);
470 
471 	if (nreq == 1 && inl && size > 1 && size <= ctx->bf_buf_size / 16) {
472 		ctrl->owner_opcode |= htobe32((qp->sq.head & 0xffff) << 8);
473 
474 		ctrl->bf_qpn |= qp->doorbell_qpn;
475 		++qp->sq.head;
476 		/*
477 		 * Make sure that descriptor is written to memory
478 		 * before writing to BlueFlame page.
479 		 */
480 		mmio_wc_spinlock(&ctx->bf_lock);
481 
482 		mlx4_bf_copy(ctx->bf_page + ctx->bf_offset, (unsigned long *) ctrl,
483 			     align(size * 16, 64));
484 		/* Flush before toggling bf_offset to be latency oriented */
485 		mmio_flush_writes();
486 
487 		ctx->bf_offset ^= ctx->bf_buf_size;
488 
489 		pthread_spin_unlock(&ctx->bf_lock);
490 	} else if (nreq) {
491 		qp->sq.head += nreq;
492 
493 		/*
494 		 * Make sure that descriptors are written before
495 		 * doorbell record.
496 		 */
497 		udma_to_device_barrier();
498 
499 		mmio_writel((unsigned long)(ctx->uar + MLX4_SEND_DOORBELL),
500 			    qp->doorbell_qpn);
501 	}
502 
503 	if (nreq)
504 		stamp_send_wqe(qp, (ind + qp->sq_spare_wqes - 1) &
505 			       (qp->sq.wqe_cnt - 1));
506 
507 	pthread_spin_unlock(&qp->sq.lock);
508 
509 	return ret;
510 }
511 
512 int mlx4_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
513 		   struct ibv_recv_wr **bad_wr)
514 {
515 	struct mlx4_qp *qp = to_mqp(ibqp);
516 	struct mlx4_wqe_data_seg *scat;
517 	int ret = 0;
518 	int nreq;
519 	int ind;
520 	int i;
521 
522 	pthread_spin_lock(&qp->rq.lock);
523 
524 	/* XXX check that state is OK to post receive */
525 
526 	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
527 
528 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
529 		if (wq_overflow(&qp->rq, nreq, to_mcq(ibqp->recv_cq))) {
530 			ret = ENOMEM;
531 			*bad_wr = wr;
532 			goto out;
533 		}
534 
535 		if (wr->num_sge > qp->rq.max_gs) {
536 			ret = ENOMEM;
537 			*bad_wr = wr;
538 			goto out;
539 		}
540 
541 		scat = get_recv_wqe(qp, ind);
542 
543 		for (i = 0; i < wr->num_sge; ++i)
544 			__set_data_seg(scat + i, wr->sg_list + i);
545 
546 		if (i < qp->rq.max_gs) {
547 			scat[i].byte_count = 0;
548 			scat[i].lkey       = htobe32(MLX4_INVALID_LKEY);
549 			scat[i].addr       = 0;
550 		}
551 
552 		qp->rq.wrid[ind] = wr->wr_id;
553 
554 		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
555 	}
556 
557 out:
558 	if (nreq) {
559 		qp->rq.head += nreq;
560 
561 		/*
562 		 * Make sure that descriptors are written before
563 		 * doorbell record.
564 		 */
565 		udma_to_device_barrier();
566 
567 		*qp->db = htobe32(qp->rq.head & 0xffff);
568 	}
569 
570 	pthread_spin_unlock(&qp->rq.lock);
571 
572 	return ret;
573 }
574 
575 static int num_inline_segs(int data, enum ibv_qp_type type)
576 {
577 	/*
578 	 * Inline data segments are not allowed to cross 64 byte
579 	 * boundaries.  For UD QPs, the data segments always start
580 	 * aligned to 64 bytes (16 byte control segment + 48 byte
581 	 * datagram segment); for other QPs, there will be a 16 byte
582 	 * control segment and possibly a 16 byte remote address
583 	 * segment, so in the worst case there will be only 32 bytes
584 	 * available for the first data segment.
585 	 */
586 	if (type == IBV_QPT_UD)
587 		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
588 			 sizeof (struct mlx4_wqe_datagram_seg)) %
589 			MLX4_INLINE_ALIGN;
590 	else
591 		data += (sizeof (struct mlx4_wqe_ctrl_seg) +
592 			 sizeof (struct mlx4_wqe_raddr_seg)) %
593 			MLX4_INLINE_ALIGN;
594 
595 	return (data + MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg) - 1) /
596 		(MLX4_INLINE_ALIGN - sizeof (struct mlx4_wqe_inline_seg));
597 }
598 
599 void mlx4_calc_sq_wqe_size(struct ibv_qp_cap *cap, enum ibv_qp_type type,
600 			   struct mlx4_qp *qp)
601 {
602 	int size;
603 	int max_sq_sge;
604 
605 	max_sq_sge	 = align(cap->max_inline_data +
606 				 num_inline_segs(cap->max_inline_data, type) *
607 				 sizeof (struct mlx4_wqe_inline_seg),
608 				 sizeof (struct mlx4_wqe_data_seg)) /
609 		sizeof (struct mlx4_wqe_data_seg);
610 	if (max_sq_sge < cap->max_send_sge)
611 		max_sq_sge = cap->max_send_sge;
612 
613 	size = max_sq_sge * sizeof (struct mlx4_wqe_data_seg);
614 	switch (type) {
615 	case IBV_QPT_UD:
616 		size += sizeof (struct mlx4_wqe_datagram_seg);
617 		break;
618 
619 	case IBV_QPT_UC:
620 		size += sizeof (struct mlx4_wqe_raddr_seg);
621 		break;
622 
623 	case IBV_QPT_XRC_SEND:
624 	case IBV_QPT_RC:
625 		size += sizeof (struct mlx4_wqe_raddr_seg);
626 		/*
627 		 * An atomic op will require an atomic segment, a
628 		 * remote address segment and one scatter entry.
629 		 */
630 		if (size < (sizeof (struct mlx4_wqe_atomic_seg) +
631 			    sizeof (struct mlx4_wqe_raddr_seg) +
632 			    sizeof (struct mlx4_wqe_data_seg)))
633 			size = (sizeof (struct mlx4_wqe_atomic_seg) +
634 				sizeof (struct mlx4_wqe_raddr_seg) +
635 				sizeof (struct mlx4_wqe_data_seg));
636 		break;
637 
638 	default:
639 		break;
640 	}
641 
642 	/* Make sure that we have enough space for a bind request */
643 	if (size < sizeof (struct mlx4_wqe_bind_seg))
644 		size = sizeof (struct mlx4_wqe_bind_seg);
645 
646 	size += sizeof (struct mlx4_wqe_ctrl_seg);
647 
648 	for (qp->sq.wqe_shift = 6; 1 << qp->sq.wqe_shift < size;
649 	     qp->sq.wqe_shift++)
650 		; /* nothing */
651 }
652 
653 int mlx4_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_cap *cap,
654 		       enum ibv_qp_type type, struct mlx4_qp *qp)
655 {
656 	qp->rq.max_gs	 = cap->max_recv_sge;
657 
658 	if (qp->sq.wqe_cnt) {
659 		qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof (uint64_t));
660 		if (!qp->sq.wrid)
661 			return -1;
662 	}
663 
664 	if (qp->rq.wqe_cnt) {
665 		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof (uint64_t));
666 		if (!qp->rq.wrid) {
667 			free(qp->sq.wrid);
668 			return -1;
669 		}
670 	}
671 
672 	for (qp->rq.wqe_shift = 4;
673 	     1 << qp->rq.wqe_shift < qp->rq.max_gs * sizeof (struct mlx4_wqe_data_seg);
674 	     qp->rq.wqe_shift++)
675 		; /* nothing */
676 
677 	qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
678 		(qp->sq.wqe_cnt << qp->sq.wqe_shift);
679 	if (qp->rq.wqe_shift > qp->sq.wqe_shift) {
680 		qp->rq.offset = 0;
681 		qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
682 	} else {
683 		qp->rq.offset = qp->sq.wqe_cnt << qp->sq.wqe_shift;
684 		qp->sq.offset = 0;
685 	}
686 
687 	if (qp->buf_size) {
688 		if (mlx4_alloc_buf(&qp->buf,
689 				   align(qp->buf_size, to_mdev(context->device)->page_size),
690 				   to_mdev(context->device)->page_size)) {
691 			free(qp->sq.wrid);
692 			free(qp->rq.wrid);
693 			return -1;
694 		}
695 
696 		memset(qp->buf.buf, 0, qp->buf_size);
697 	} else {
698 		qp->buf.buf = NULL;
699 	}
700 
701 	return 0;
702 }
703 
704 void mlx4_set_sq_sizes(struct mlx4_qp *qp, struct ibv_qp_cap *cap,
705 		       enum ibv_qp_type type)
706 {
707 	int wqe_size;
708 
709 	wqe_size = (1 << qp->sq.wqe_shift) - sizeof (struct mlx4_wqe_ctrl_seg);
710 	switch (type) {
711 	case IBV_QPT_UD:
712 		wqe_size -= sizeof (struct mlx4_wqe_datagram_seg);
713 		break;
714 
715 	case IBV_QPT_XRC_SEND:
716 	case IBV_QPT_UC:
717 	case IBV_QPT_RC:
718 		wqe_size -= sizeof (struct mlx4_wqe_raddr_seg);
719 		break;
720 
721 	default:
722 		break;
723 	}
724 
725 	qp->sq.max_gs	     = wqe_size / sizeof (struct mlx4_wqe_data_seg);
726 	cap->max_send_sge    = qp->sq.max_gs;
727 	qp->sq.max_post	     = qp->sq.wqe_cnt - qp->sq_spare_wqes;
728 	cap->max_send_wr     = qp->sq.max_post;
729 
730 	/*
731 	 * Inline data segments can't cross a 64 byte boundary.  So
732 	 * subtract off one segment header for each 64-byte chunk,
733 	 * taking into account the fact that wqe_size will be 32 mod
734 	 * 64 for non-UD QPs.
735 	 */
736 	qp->max_inline_data  = wqe_size -
737 		sizeof (struct mlx4_wqe_inline_seg) *
738 		(align(wqe_size, MLX4_INLINE_ALIGN) / MLX4_INLINE_ALIGN);
739 	cap->max_inline_data = qp->max_inline_data;
740 }
741 
742 struct mlx4_qp *mlx4_find_qp(struct mlx4_context *ctx, uint32_t qpn)
743 {
744 	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
745 
746 	if (ctx->qp_table[tind].refcnt)
747 		return ctx->qp_table[tind].table[qpn & ctx->qp_table_mask];
748 	else
749 		return NULL;
750 }
751 
752 int mlx4_store_qp(struct mlx4_context *ctx, uint32_t qpn, struct mlx4_qp *qp)
753 {
754 	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
755 
756 	if (!ctx->qp_table[tind].refcnt) {
757 		ctx->qp_table[tind].table = calloc(ctx->qp_table_mask + 1,
758 						   sizeof (struct mlx4_qp *));
759 		if (!ctx->qp_table[tind].table)
760 			return -1;
761 	}
762 
763 	++ctx->qp_table[tind].refcnt;
764 	ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = qp;
765 	return 0;
766 }
767 
768 void mlx4_clear_qp(struct mlx4_context *ctx, uint32_t qpn)
769 {
770 	int tind = (qpn & (ctx->num_qps - 1)) >> ctx->qp_table_shift;
771 
772 	if (!--ctx->qp_table[tind].refcnt)
773 		free(ctx->qp_table[tind].table);
774 	else
775 		ctx->qp_table[tind].table[qpn & ctx->qp_table_mask] = NULL;
776 }
777