xref: /freebsd/contrib/ofed/libmlx5/qp.c (revision 488ab515d6cc02f6f743f0badfc8e94eb553cd30)
1 /*
2  * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <config.h>
34 
35 #include <stdlib.h>
36 #include <pthread.h>
37 #include <string.h>
38 #include <errno.h>
39 #include <stdio.h>
40 
41 #include "mlx5.h"
42 #include "doorbell.h"
43 #include "wqe.h"
44 
45 #define MLX5_ATOMIC_SIZE 8
46 
47 static const uint32_t mlx5_ib_opcode[] = {
48 	[IBV_WR_SEND]			= MLX5_OPCODE_SEND,
49 	[IBV_WR_SEND_WITH_INV]		= MLX5_OPCODE_SEND_INVAL,
50 	[IBV_WR_SEND_WITH_IMM]		= MLX5_OPCODE_SEND_IMM,
51 	[IBV_WR_RDMA_WRITE]		= MLX5_OPCODE_RDMA_WRITE,
52 	[IBV_WR_RDMA_WRITE_WITH_IMM]	= MLX5_OPCODE_RDMA_WRITE_IMM,
53 	[IBV_WR_RDMA_READ]		= MLX5_OPCODE_RDMA_READ,
54 	[IBV_WR_ATOMIC_CMP_AND_SWP]	= MLX5_OPCODE_ATOMIC_CS,
55 	[IBV_WR_ATOMIC_FETCH_AND_ADD]	= MLX5_OPCODE_ATOMIC_FA,
56 	[IBV_WR_BIND_MW]		= MLX5_OPCODE_UMR,
57 	[IBV_WR_LOCAL_INV]		= MLX5_OPCODE_UMR,
58 	[IBV_WR_TSO]			= MLX5_OPCODE_TSO,
59 };
60 
61 static void *get_recv_wqe(struct mlx5_qp *qp, int n)
62 {
63 	return qp->buf.buf + qp->rq.offset + (n << qp->rq.wqe_shift);
64 }
65 
66 static void *get_wq_recv_wqe(struct mlx5_rwq *rwq, int n)
67 {
68 	return rwq->pbuff  + (n << rwq->rq.wqe_shift);
69 }
70 
71 static int copy_to_scat(struct mlx5_wqe_data_seg *scat, void *buf, int *size,
72 			 int max)
73 {
74 	int copy;
75 	int i;
76 
77 	if (unlikely(!(*size)))
78 		return IBV_WC_SUCCESS;
79 
80 	for (i = 0; i < max; ++i) {
81 		copy = min_t(long, *size, be32toh(scat->byte_count));
82 		memcpy((void *)(unsigned long)be64toh(scat->addr), buf, copy);
83 		*size -= copy;
84 		if (*size == 0)
85 			return IBV_WC_SUCCESS;
86 
87 		buf += copy;
88 		++scat;
89 	}
90 	return IBV_WC_LOC_LEN_ERR;
91 }
92 
93 int mlx5_copy_to_recv_wqe(struct mlx5_qp *qp, int idx, void *buf, int size)
94 {
95 	struct mlx5_wqe_data_seg *scat;
96 	int max = 1 << (qp->rq.wqe_shift - 4);
97 
98 	scat = get_recv_wqe(qp, idx);
99 	if (unlikely(qp->wq_sig))
100 		++scat;
101 
102 	return copy_to_scat(scat, buf, &size, max);
103 }
104 
105 int mlx5_copy_to_send_wqe(struct mlx5_qp *qp, int idx, void *buf, int size)
106 {
107 	struct mlx5_wqe_ctrl_seg *ctrl;
108 	struct mlx5_wqe_data_seg *scat;
109 	void *p;
110 	int max;
111 
112 	idx &= (qp->sq.wqe_cnt - 1);
113 	ctrl = mlx5_get_send_wqe(qp, idx);
114 	if (qp->ibv_qp->qp_type != IBV_QPT_RC) {
115 		fprintf(stderr, "scatter to CQE is supported only for RC QPs\n");
116 		return IBV_WC_GENERAL_ERR;
117 	}
118 	p = ctrl + 1;
119 
120 	switch (be32toh(ctrl->opmod_idx_opcode) & 0xff) {
121 	case MLX5_OPCODE_RDMA_READ:
122 		p = p + sizeof(struct mlx5_wqe_raddr_seg);
123 		break;
124 
125 	case MLX5_OPCODE_ATOMIC_CS:
126 	case MLX5_OPCODE_ATOMIC_FA:
127 		p = p + sizeof(struct mlx5_wqe_raddr_seg) +
128 			sizeof(struct mlx5_wqe_atomic_seg);
129 		break;
130 
131 	default:
132 		fprintf(stderr, "scatter to CQE for opcode %d\n",
133 			be32toh(ctrl->opmod_idx_opcode) & 0xff);
134 		return IBV_WC_REM_INV_REQ_ERR;
135 	}
136 
137 	scat = p;
138 	max = (be32toh(ctrl->qpn_ds) & 0x3F) - (((void *)scat - (void *)ctrl) >> 4);
139 	if (unlikely((void *)(scat + max) > qp->sq.qend)) {
140 		int tmp = ((void *)qp->sq.qend - (void *)scat) >> 4;
141 		int orig_size = size;
142 
143 		if (copy_to_scat(scat, buf, &size, tmp) == IBV_WC_SUCCESS)
144 			return IBV_WC_SUCCESS;
145 		max = max - tmp;
146 		buf += orig_size - size;
147 		scat = mlx5_get_send_wqe(qp, 0);
148 	}
149 
150 	return copy_to_scat(scat, buf, &size, max);
151 }
152 
153 void *mlx5_get_send_wqe(struct mlx5_qp *qp, int n)
154 {
155 	return qp->sq_start + (n << MLX5_SEND_WQE_SHIFT);
156 }
157 
158 void mlx5_init_rwq_indices(struct mlx5_rwq *rwq)
159 {
160 	rwq->rq.head	 = 0;
161 	rwq->rq.tail	 = 0;
162 }
163 
164 void mlx5_init_qp_indices(struct mlx5_qp *qp)
165 {
166 	qp->sq.head	 = 0;
167 	qp->sq.tail	 = 0;
168 	qp->rq.head	 = 0;
169 	qp->rq.tail	 = 0;
170 	qp->sq.cur_post  = 0;
171 }
172 
173 static int mlx5_wq_overflow(struct mlx5_wq *wq, int nreq, struct mlx5_cq *cq)
174 {
175 	unsigned cur;
176 
177 	cur = wq->head - wq->tail;
178 	if (cur + nreq < wq->max_post)
179 		return 0;
180 
181 	mlx5_spin_lock(&cq->lock);
182 	cur = wq->head - wq->tail;
183 	mlx5_spin_unlock(&cq->lock);
184 
185 	return cur + nreq >= wq->max_post;
186 }
187 
188 static inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
189 				 uint64_t remote_addr, uint32_t rkey)
190 {
191 	rseg->raddr    = htobe64(remote_addr);
192 	rseg->rkey     = htobe32(rkey);
193 	rseg->reserved = 0;
194 }
195 
196 static void set_atomic_seg(struct mlx5_wqe_atomic_seg *aseg,
197 			   enum ibv_wr_opcode   opcode,
198 			   uint64_t swap,
199 			   uint64_t compare_add)
200 {
201 	if (opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
202 		aseg->swap_add = htobe64(swap);
203 		aseg->compare  = htobe64(compare_add);
204 	} else {
205 		aseg->swap_add = htobe64(compare_add);
206 	}
207 }
208 
209 static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
210 			     struct ibv_send_wr *wr)
211 {
212 	memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof dseg->av);
213 	dseg->av.dqp_dct = htobe32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV);
214 	dseg->av.key.qkey.qkey = htobe32(wr->wr.ud.remote_qkey);
215 }
216 
217 static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ibv_sge *sg,
218 			     int offset)
219 {
220 	dseg->byte_count = htobe32(sg->length - offset);
221 	dseg->lkey       = htobe32(sg->lkey);
222 	dseg->addr       = htobe64(sg->addr + offset);
223 }
224 
225 static void set_data_ptr_seg_atomic(struct mlx5_wqe_data_seg *dseg,
226 				    struct ibv_sge *sg)
227 {
228 	dseg->byte_count = htobe32(MLX5_ATOMIC_SIZE);
229 	dseg->lkey       = htobe32(sg->lkey);
230 	dseg->addr       = htobe64(sg->addr);
231 }
232 
233 /*
234  * Avoid using memcpy() to copy to BlueFlame page, since memcpy()
235  * implementations may use move-string-buffer assembler instructions,
236  * which do not guarantee order of copying.
237  */
238 static void mlx5_bf_copy(unsigned long long *dst, unsigned long long *src,
239 			 unsigned bytecnt, struct mlx5_qp *qp)
240 {
241 	while (bytecnt > 0) {
242 		*dst++ = *src++;
243 		*dst++ = *src++;
244 		*dst++ = *src++;
245 		*dst++ = *src++;
246 		*dst++ = *src++;
247 		*dst++ = *src++;
248 		*dst++ = *src++;
249 		*dst++ = *src++;
250 		bytecnt -= 8 * sizeof(unsigned long long);
251 		if (unlikely(src == qp->sq.qend))
252 			src = qp->sq_start;
253 	}
254 }
255 
256 static uint32_t send_ieth(struct ibv_send_wr *wr)
257 {
258 	switch (wr->opcode) {
259 	case IBV_WR_SEND_WITH_IMM:
260 	case IBV_WR_RDMA_WRITE_WITH_IMM:
261 		return wr->imm_data;
262 	case IBV_WR_SEND_WITH_INV:
263 		return htobe32(wr->imm_data);
264 	default:
265 		return 0;
266 	}
267 }
268 
269 static int set_data_inl_seg(struct mlx5_qp *qp, struct ibv_send_wr *wr,
270 			    void *wqe, int *sz,
271 			    struct mlx5_sg_copy_ptr *sg_copy_ptr)
272 {
273 	struct mlx5_wqe_inline_seg *seg;
274 	void *addr;
275 	int len;
276 	int i;
277 	int inl = 0;
278 	void *qend = qp->sq.qend;
279 	int copy;
280 	int offset = sg_copy_ptr->offset;
281 
282 	seg = wqe;
283 	wqe += sizeof *seg;
284 	for (i = sg_copy_ptr->index; i < wr->num_sge; ++i) {
285 		addr = (void *) (unsigned long)(wr->sg_list[i].addr + offset);
286 		len  = wr->sg_list[i].length - offset;
287 		inl += len;
288 		offset = 0;
289 
290 		if (unlikely(inl > qp->max_inline_data))
291 			return ENOMEM;
292 
293 		if (unlikely(wqe + len > qend)) {
294 			copy = qend - wqe;
295 			memcpy(wqe, addr, copy);
296 			addr += copy;
297 			len -= copy;
298 			wqe = mlx5_get_send_wqe(qp, 0);
299 		}
300 		memcpy(wqe, addr, len);
301 		wqe += len;
302 	}
303 
304 	if (likely(inl)) {
305 		seg->byte_count = htobe32(inl | MLX5_INLINE_SEG);
306 		*sz = align(inl + sizeof seg->byte_count, 16) / 16;
307 	} else
308 		*sz = 0;
309 
310 	return 0;
311 }
312 
313 static uint8_t wq_sig(struct mlx5_wqe_ctrl_seg *ctrl)
314 {
315 	return calc_sig(ctrl, be32toh(ctrl->qpn_ds));
316 }
317 
#ifdef MLX5_DEBUG
/*
 * Debug helper: print a send WQE to fp as lines of four 32-bit words,
 * each converted with be32toh().
 *
 * idx is the send-queue slot where the WQE starts and size_16 its size
 * in 16-byte units.  After every four lines the dump moves on to the
 * next slot, wrapping at qp->sq.wqe_cnt.
 */
static void dump_wqe(FILE *fp, int idx, int size_16, struct mlx5_qp *qp)
{
	uint32_t *words = NULL;
	int next_slot = idx;
	int line;

	fprintf(fp, "dump wqe at %p\n", mlx5_get_send_wqe(qp, next_slot));
	for (line = 0; line < size_16; ++line) {
		int w = (line & 3) * 4;

		if (w == 0) {
			/* Starting a new slot: fetch it and advance. */
			words = mlx5_get_send_wqe(qp, next_slot);
			next_slot = (next_slot + 1) & (qp->sq.wqe_cnt - 1);
		}
		fprintf(fp, "%08x %08x %08x %08x\n", be32toh(words[w]), be32toh(words[w + 1]),
			be32toh(words[w + 2]), be32toh(words[w + 3]));
	}
}
#endif /* MLX5_DEBUG */
338 
339 
340 void *mlx5_get_atomic_laddr(struct mlx5_qp *qp, uint16_t idx, int *byte_count)
341 {
342 	struct mlx5_wqe_data_seg *dpseg;
343 	void *addr;
344 
345 	dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) +
346 		sizeof(struct mlx5_wqe_raddr_seg) +
347 		sizeof(struct mlx5_wqe_atomic_seg);
348 	addr = (void *)(unsigned long)be64toh(dpseg->addr);
349 
350 	/*
351 	 * Currently byte count is always 8 bytes. Fix this when
352 	 * we support variable size of atomics
353 	 */
354 	*byte_count = 8;
355 	return addr;
356 }
357 
358 static inline int copy_eth_inline_headers(struct ibv_qp *ibqp,
359 					  struct ibv_send_wr *wr,
360 					  struct mlx5_wqe_eth_seg *eseg,
361 					  struct mlx5_sg_copy_ptr *sg_copy_ptr)
362 {
363 	uint32_t inl_hdr_size = MLX5_ETH_L2_INLINE_HEADER_SIZE;
364 	int inl_hdr_copy_size = 0;
365 	int j = 0;
366 	FILE *fp = to_mctx(ibqp->context)->dbg_fp;
367 
368 	if (unlikely(wr->num_sge < 1)) {
369 		mlx5_dbg(fp, MLX5_DBG_QP_SEND, "illegal num_sge: %d, minimum is 1\n",
370 			 wr->num_sge);
371 		return EINVAL;
372 	}
373 
374 	if (likely(wr->sg_list[0].length >= MLX5_ETH_L2_INLINE_HEADER_SIZE)) {
375 		inl_hdr_copy_size = MLX5_ETH_L2_INLINE_HEADER_SIZE;
376 		memcpy(eseg->inline_hdr_start,
377 		       (void *)(uintptr_t)wr->sg_list[0].addr,
378 		       inl_hdr_copy_size);
379 	} else {
380 		for (j = 0; j < wr->num_sge && inl_hdr_size > 0; ++j) {
381 			inl_hdr_copy_size = min(wr->sg_list[j].length,
382 						inl_hdr_size);
383 			memcpy(eseg->inline_hdr_start +
384 			       (MLX5_ETH_L2_INLINE_HEADER_SIZE - inl_hdr_size),
385 			       (void *)(uintptr_t)wr->sg_list[j].addr,
386 			       inl_hdr_copy_size);
387 			inl_hdr_size -= inl_hdr_copy_size;
388 		}
389 		if (unlikely(inl_hdr_size)) {
390 			mlx5_dbg(fp, MLX5_DBG_QP_SEND, "Ethernet headers < 16 bytes\n");
391 			return EINVAL;
392 		}
393 		--j;
394 	}
395 
396 
397 	eseg->inline_hdr_sz = htobe16(MLX5_ETH_L2_INLINE_HEADER_SIZE);
398 
399 	/* If we copied all the sge into the inline-headers, then we need to
400 	 * start copying from the next sge into the data-segment.
401 	 */
402 	if (unlikely(wr->sg_list[j].length == inl_hdr_copy_size)) {
403 		++j;
404 		inl_hdr_copy_size = 0;
405 	}
406 
407 	sg_copy_ptr->index = j;
408 	sg_copy_ptr->offset = inl_hdr_copy_size;
409 
410 	return 0;
411 }
412 
/* Round x up to the next multiple of 2^log_a. */
#undef	ALIGN
#define ALIGN(x, log_a) ((((x) + (1 << (log_a)) - 1)) & ~((1 << (log_a)) - 1))

/*
 * Convert a number of KLM entries into the big-endian value stored in
 * the UMR control segment's klm_octowords field: the entry count is
 * rounded up to a multiple of 8 and then halved.
 */
static inline uint16_t get_klm_octo(int nentries)
{
	int rounded = ALIGN(nentries, 3);

	return htobe16(rounded / 2);
}
420 
421 static void set_umr_data_seg(struct mlx5_qp *qp, enum ibv_mw_type type,
422 			     int32_t rkey, struct ibv_mw_bind_info *bind_info,
423 			     uint32_t qpn, void **seg, int *size)
424 {
425 	union {
426 		struct mlx5_wqe_umr_klm_seg	klm;
427 		uint8_t				reserved[64];
428 	} *data = *seg;
429 
430 	data->klm.byte_count = htobe32(bind_info->length);
431 	data->klm.mkey = htobe32(bind_info->mr->lkey);
432 	data->klm.address = htobe64(bind_info->addr);
433 
434 	memset(&data->klm + 1, 0, sizeof(data->reserved) -
435 	       sizeof(data->klm));
436 
437 	*seg += sizeof(*data);
438 	*size += (sizeof(*data) / 16);
439 }
440 
441 static void set_umr_mkey_seg(struct mlx5_qp *qp, enum ibv_mw_type type,
442 			     int32_t rkey, struct ibv_mw_bind_info *bind_info,
443 			     uint32_t qpn, void **seg, int *size)
444 {
445 	struct mlx5_wqe_mkey_context_seg	*mkey = *seg;
446 
447 	mkey->qpn_mkey = htobe32((rkey & 0xFF) |
448 				   ((type == IBV_MW_TYPE_1 || !bind_info->length) ?
449 				    0xFFFFFF00 : qpn << 8));
450 	if (bind_info->length) {
451 		/* Local read is set in kernel */
452 		mkey->access_flags = 0;
453 		mkey->free = 0;
454 		if (bind_info->mw_access_flags & IBV_ACCESS_LOCAL_WRITE)
455 			mkey->access_flags |=
456 				MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_LOCAL_WRITE;
457 		if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_WRITE)
458 			mkey->access_flags |=
459 				MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_WRITE;
460 		if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_READ)
461 			mkey->access_flags |=
462 				MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_REMOTE_READ;
463 		if (bind_info->mw_access_flags & IBV_ACCESS_REMOTE_ATOMIC)
464 			mkey->access_flags |=
465 				MLX5_WQE_MKEY_CONTEXT_ACCESS_FLAGS_ATOMIC;
466 		if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED)
467 			mkey->start_addr = 0;
468 		else
469 			mkey->start_addr = htobe64(bind_info->addr);
470 		mkey->len = htobe64(bind_info->length);
471 	} else {
472 		mkey->free = MLX5_WQE_MKEY_CONTEXT_FREE;
473 	}
474 
475 	*seg += sizeof(struct mlx5_wqe_mkey_context_seg);
476 	*size += (sizeof(struct mlx5_wqe_mkey_context_seg) / 16);
477 }
478 
479 static inline void set_umr_control_seg(struct mlx5_qp *qp, enum ibv_mw_type type,
480 				       int32_t rkey, struct ibv_mw_bind_info *bind_info,
481 				       uint32_t qpn, void **seg, int *size)
482 {
483 	struct mlx5_wqe_umr_ctrl_seg		*ctrl = *seg;
484 
485 	ctrl->flags = MLX5_WQE_UMR_CTRL_FLAG_TRNSLATION_OFFSET |
486 		MLX5_WQE_UMR_CTRL_FLAG_INLINE;
487 	ctrl->mkey_mask = htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_FREE |
488 				     MLX5_WQE_UMR_CTRL_MKEY_MASK_MKEY);
489 	ctrl->translation_offset = 0;
490 	memset(ctrl->rsvd0, 0, sizeof(ctrl->rsvd0));
491 	memset(ctrl->rsvd1, 0, sizeof(ctrl->rsvd1));
492 
493 	if (type == IBV_MW_TYPE_2)
494 		ctrl->mkey_mask |= htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_QPN);
495 
496 	if (bind_info->length) {
497 		ctrl->klm_octowords = get_klm_octo(1);
498 		if (type == IBV_MW_TYPE_2)
499 			ctrl->flags |=  MLX5_WQE_UMR_CTRL_FLAG_CHECK_FREE;
500 		ctrl->mkey_mask |= htobe64(MLX5_WQE_UMR_CTRL_MKEY_MASK_LEN	|
501 					      MLX5_WQE_UMR_CTRL_MKEY_MASK_START_ADDR |
502 					      MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_LOCAL_WRITE |
503 					      MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_READ |
504 					      MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_REMOTE_WRITE |
505 					      MLX5_WQE_UMR_CTRL_MKEY_MASK_ACCESS_ATOMIC);
506 	} else {
507 		ctrl->klm_octowords = get_klm_octo(0);
508 		if (type == IBV_MW_TYPE_2)
509 			ctrl->flags |= MLX5_WQE_UMR_CTRL_FLAG_CHECK_QPN;
510 	}
511 
512 	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
513 	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
514 }
515 
516 static inline int set_bind_wr(struct mlx5_qp *qp, enum ibv_mw_type type,
517 			      int32_t rkey, struct ibv_mw_bind_info *bind_info,
518 			      uint32_t qpn, void **seg, int *size)
519 {
520 	void *qend = qp->sq.qend;
521 
522 #ifdef MW_DEBUG
523 	if (bind_info->mw_access_flags &
524 	    ~(IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_READ |
525 	     IBV_ACCESS_REMOTE_WRITE))
526 		return EINVAL;
527 
528 	if (bind_info->mr &&
529 	    (bind_info->mr->addr > (void *)bind_info->addr ||
530 	     bind_info->mr->addr + bind_info->mr->length <
531 	     (void *)bind_info->addr + bind_info->length ||
532 	     !(to_mmr(bind_info->mr)->alloc_flags &  IBV_ACCESS_MW_BIND) ||
533 	     (bind_info->mw_access_flags &
534 	      (IBV_ACCESS_REMOTE_ATOMIC | IBV_ACCESS_REMOTE_WRITE) &&
535 	      !(to_mmr(bind_info->mr)->alloc_flags & IBV_ACCESS_LOCAL_WRITE))))
536 		return EINVAL;
537 
538 #endif
539 
540 	/* check that len > 2GB because KLM support only 2GB */
541 	if (bind_info->length > 1UL << 31)
542 		return EOPNOTSUPP;
543 
544 	set_umr_control_seg(qp, type, rkey, bind_info, qpn, seg, size);
545 	if (unlikely((*seg == qend)))
546 		*seg = mlx5_get_send_wqe(qp, 0);
547 
548 	set_umr_mkey_seg(qp, type, rkey, bind_info, qpn, seg, size);
549 	if (!bind_info->length)
550 		return 0;
551 
552 	if (unlikely((seg == qend)))
553 		*seg = mlx5_get_send_wqe(qp, 0);
554 
555 	set_umr_data_seg(qp, type, rkey, bind_info, qpn, seg, size);
556 	return 0;
557 }
558 
559 /* Copy tso header to eth segment with considering padding and WQE
560  * wrap around in WQ buffer.
561  */
562 static inline int set_tso_eth_seg(void **seg, struct ibv_send_wr *wr,
563 				   void *qend, struct mlx5_qp *qp, int *size)
564 {
565 	struct mlx5_wqe_eth_seg *eseg = *seg;
566 	int size_of_inl_hdr_start = sizeof(eseg->inline_hdr_start);
567 	uint64_t left, left_len, copy_sz;
568 	void *pdata = wr->tso.hdr;
569 	FILE *fp = to_mctx(qp->ibv_qp->context)->dbg_fp;
570 
571 	if (unlikely(wr->tso.hdr_sz < MLX5_ETH_L2_MIN_HEADER_SIZE ||
572 		     wr->tso.hdr_sz > qp->max_tso_header)) {
573 		mlx5_dbg(fp, MLX5_DBG_QP_SEND,
574 			 "TSO header size should be at least %d and at most %d\n",
575 			 MLX5_ETH_L2_MIN_HEADER_SIZE,
576 			 qp->max_tso_header);
577 		return EINVAL;
578 	}
579 
580 	left = wr->tso.hdr_sz;
581 	eseg->mss = htobe16(wr->tso.mss);
582 	eseg->inline_hdr_sz = htobe16(wr->tso.hdr_sz);
583 
584 	/* Check if there is space till the end of queue, if yes,
585 	 * copy all in one shot, otherwise copy till the end of queue,
586 	 * rollback and then copy the left
587 	 */
588 	left_len = qend - (void *)eseg->inline_hdr_start;
589 	copy_sz = min(left_len, left);
590 
591 	memcpy(eseg->inline_hdr_start, pdata, copy_sz);
592 
593 	/* The -1 is because there are already 16 bytes included in
594 	 * eseg->inline_hdr[16]
595 	 */
596 	*seg += align(copy_sz - size_of_inl_hdr_start, 16) - 16;
597 	*size += align(copy_sz - size_of_inl_hdr_start, 16) / 16 - 1;
598 
599 	/* The last wqe in the queue */
600 	if (unlikely(copy_sz < left)) {
601 		*seg = mlx5_get_send_wqe(qp, 0);
602 		left -= copy_sz;
603 		pdata += copy_sz;
604 		memcpy(*seg, pdata, left);
605 		*seg += align(left, 16);
606 		*size += align(left, 16) / 16;
607 	}
608 
609 	return 0;
610 }
611 
612 static inline int _mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
613 				  struct ibv_send_wr **bad_wr)
614 {
615 	struct mlx5_context *ctx;
616 	struct mlx5_qp *qp = to_mqp(ibqp);
617 	void *seg;
618 	struct mlx5_wqe_eth_seg *eseg;
619 	struct mlx5_wqe_ctrl_seg *ctrl = NULL;
620 	struct mlx5_wqe_data_seg *dpseg;
621 	struct mlx5_sg_copy_ptr sg_copy_ptr = {.index = 0, .offset = 0};
622 	int nreq;
623 	int inl = 0;
624 	int err = 0;
625 	int size = 0;
626 	int i;
627 	unsigned idx;
628 	uint8_t opmod = 0;
629 	struct mlx5_bf *bf = qp->bf;
630 	void *qend = qp->sq.qend;
631 	uint32_t mlx5_opcode;
632 	struct mlx5_wqe_xrc_seg *xrc;
633 	uint8_t fence;
634 	uint8_t next_fence;
635 	uint32_t max_tso = 0;
636 	FILE *fp = to_mctx(ibqp->context)->dbg_fp; /* The compiler ignores in non-debug mode */
637 
638 	mlx5_spin_lock(&qp->sq.lock);
639 
640 	next_fence = qp->fm_cache;
641 
642 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
643 		if (unlikely(wr->opcode < 0 ||
644 		    wr->opcode >= sizeof mlx5_ib_opcode / sizeof mlx5_ib_opcode[0])) {
645 			mlx5_dbg(fp, MLX5_DBG_QP_SEND, "bad opcode %d\n", wr->opcode);
646 			err = EINVAL;
647 			*bad_wr = wr;
648 			goto out;
649 		}
650 
651 		if (unlikely(mlx5_wq_overflow(&qp->sq, nreq,
652 					      to_mcq(qp->ibv_qp->send_cq)))) {
653 			mlx5_dbg(fp, MLX5_DBG_QP_SEND, "work queue overflow\n");
654 			err = ENOMEM;
655 			*bad_wr = wr;
656 			goto out;
657 		}
658 
659 		if (unlikely(wr->num_sge > qp->sq.max_gs)) {
660 			mlx5_dbg(fp, MLX5_DBG_QP_SEND, "max gs exceeded %d (max = %d)\n",
661 				 wr->num_sge, qp->sq.max_gs);
662 			err = ENOMEM;
663 			*bad_wr = wr;
664 			goto out;
665 		}
666 
667 		if (wr->send_flags & IBV_SEND_FENCE)
668 			fence = MLX5_WQE_CTRL_FENCE;
669 		else
670 			fence = next_fence;
671 		next_fence = 0;
672 		idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
673 		ctrl = seg = mlx5_get_send_wqe(qp, idx);
674 		*(uint32_t *)(seg + 8) = 0;
675 		ctrl->imm = send_ieth(wr);
676 		ctrl->fm_ce_se = qp->sq_signal_bits | fence |
677 			(wr->send_flags & IBV_SEND_SIGNALED ?
678 			 MLX5_WQE_CTRL_CQ_UPDATE : 0) |
679 			(wr->send_flags & IBV_SEND_SOLICITED ?
680 			 MLX5_WQE_CTRL_SOLICITED : 0);
681 
682 		seg += sizeof *ctrl;
683 		size = sizeof *ctrl / 16;
684 
685 		switch (ibqp->qp_type) {
686 		case IBV_QPT_XRC_SEND:
687 			if (unlikely(wr->opcode != IBV_WR_BIND_MW &&
688 				     wr->opcode != IBV_WR_LOCAL_INV)) {
689 				xrc = seg;
690 				xrc->xrc_srqn = htobe32(wr->qp_type.xrc.remote_srqn);
691 				seg += sizeof(*xrc);
692 				size += sizeof(*xrc) / 16;
693 			}
694 			/* fall through */
695 		case IBV_QPT_RC:
696 			switch (wr->opcode) {
697 			case IBV_WR_RDMA_READ:
698 			case IBV_WR_RDMA_WRITE:
699 			case IBV_WR_RDMA_WRITE_WITH_IMM:
700 				set_raddr_seg(seg, wr->wr.rdma.remote_addr,
701 					      wr->wr.rdma.rkey);
702 				seg  += sizeof(struct mlx5_wqe_raddr_seg);
703 				size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
704 				break;
705 
706 			case IBV_WR_ATOMIC_CMP_AND_SWP:
707 			case IBV_WR_ATOMIC_FETCH_AND_ADD:
708 				if (unlikely(!qp->atomics_enabled)) {
709 					mlx5_dbg(fp, MLX5_DBG_QP_SEND, "atomic operations are not supported\n");
710 					err = ENOSYS;
711 					*bad_wr = wr;
712 					goto out;
713 				}
714 				set_raddr_seg(seg, wr->wr.atomic.remote_addr,
715 					      wr->wr.atomic.rkey);
716 				seg  += sizeof(struct mlx5_wqe_raddr_seg);
717 
718 				set_atomic_seg(seg, wr->opcode,
719 					       wr->wr.atomic.swap,
720 					       wr->wr.atomic.compare_add);
721 				seg  += sizeof(struct mlx5_wqe_atomic_seg);
722 
723 				size += (sizeof(struct mlx5_wqe_raddr_seg) +
724 				sizeof(struct mlx5_wqe_atomic_seg)) / 16;
725 				break;
726 
727 			case IBV_WR_BIND_MW:
728 				next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
729 				ctrl->imm = htobe32(wr->bind_mw.mw->rkey);
730 				err = set_bind_wr(qp, wr->bind_mw.mw->type,
731 						  wr->bind_mw.rkey,
732 						  &wr->bind_mw.bind_info,
733 						  ibqp->qp_num, &seg, &size);
734 				if (err) {
735 					*bad_wr = wr;
736 					goto out;
737 				}
738 
739 				qp->sq.wr_data[idx] = IBV_WC_BIND_MW;
740 				break;
741 			case IBV_WR_LOCAL_INV: {
742 				struct ibv_mw_bind_info	bind_info = {};
743 
744 				next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
745 				ctrl->imm = htobe32(wr->imm_data);
746 				err = set_bind_wr(qp, IBV_MW_TYPE_2, 0,
747 						  &bind_info, ibqp->qp_num,
748 						  &seg, &size);
749 				if (err) {
750 					*bad_wr = wr;
751 					goto out;
752 				}
753 
754 				qp->sq.wr_data[idx] = IBV_WC_LOCAL_INV;
755 				break;
756 			}
757 
758 			default:
759 				break;
760 			}
761 			break;
762 
763 		case IBV_QPT_UC:
764 			switch (wr->opcode) {
765 			case IBV_WR_RDMA_WRITE:
766 			case IBV_WR_RDMA_WRITE_WITH_IMM:
767 				set_raddr_seg(seg, wr->wr.rdma.remote_addr,
768 					      wr->wr.rdma.rkey);
769 				seg  += sizeof(struct mlx5_wqe_raddr_seg);
770 				size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
771 				break;
772 			case IBV_WR_BIND_MW:
773 				next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
774 				ctrl->imm = htobe32(wr->bind_mw.mw->rkey);
775 				err = set_bind_wr(qp, wr->bind_mw.mw->type,
776 						  wr->bind_mw.rkey,
777 						  &wr->bind_mw.bind_info,
778 						  ibqp->qp_num, &seg, &size);
779 				if (err) {
780 					*bad_wr = wr;
781 					goto out;
782 				}
783 
784 				qp->sq.wr_data[idx] = IBV_WC_BIND_MW;
785 				break;
786 			case IBV_WR_LOCAL_INV: {
787 				struct ibv_mw_bind_info	bind_info = {};
788 
789 				next_fence = MLX5_WQE_CTRL_INITIATOR_SMALL_FENCE;
790 				ctrl->imm = htobe32(wr->imm_data);
791 				err = set_bind_wr(qp, IBV_MW_TYPE_2, 0,
792 						  &bind_info, ibqp->qp_num,
793 						  &seg, &size);
794 				if (err) {
795 					*bad_wr = wr;
796 					goto out;
797 				}
798 
799 				qp->sq.wr_data[idx] = IBV_WC_LOCAL_INV;
800 				break;
801 			}
802 
803 			default:
804 				break;
805 			}
806 			break;
807 
808 		case IBV_QPT_UD:
809 			set_datagram_seg(seg, wr);
810 			seg  += sizeof(struct mlx5_wqe_datagram_seg);
811 			size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
812 			if (unlikely((seg == qend)))
813 				seg = mlx5_get_send_wqe(qp, 0);
814 			break;
815 
816 		case IBV_QPT_RAW_PACKET:
817 			memset(seg, 0, sizeof(struct mlx5_wqe_eth_seg));
818 			eseg = seg;
819 
820 			if (wr->send_flags & IBV_SEND_IP_CSUM) {
821 				if (!(qp->qp_cap_cache & MLX5_CSUM_SUPPORT_RAW_OVER_ETH)) {
822 					err = EINVAL;
823 					*bad_wr = wr;
824 					goto out;
825 				}
826 
827 				eseg->cs_flags |= MLX5_ETH_WQE_L3_CSUM | MLX5_ETH_WQE_L4_CSUM;
828 			}
829 
830 			if (wr->opcode == IBV_WR_TSO) {
831 				max_tso = qp->max_tso;
832 				err = set_tso_eth_seg(&seg, wr, qend, qp, &size);
833 				if (unlikely(err)) {
834 					*bad_wr = wr;
835 					goto out;
836 				}
837 			} else {
838 				err = copy_eth_inline_headers(ibqp, wr, seg, &sg_copy_ptr);
839 				if (unlikely(err)) {
840 					*bad_wr = wr;
841 					mlx5_dbg(fp, MLX5_DBG_QP_SEND,
842 						 "copy_eth_inline_headers failed, err: %d\n",
843 						 err);
844 					goto out;
845 				}
846 			}
847 
848 			seg += sizeof(struct mlx5_wqe_eth_seg);
849 			size += sizeof(struct mlx5_wqe_eth_seg) / 16;
850 			break;
851 
852 		default:
853 			break;
854 		}
855 
856 		if (wr->send_flags & IBV_SEND_INLINE && wr->num_sge) {
857 			int sz = 0;
858 
859 			err = set_data_inl_seg(qp, wr, seg, &sz, &sg_copy_ptr);
860 			if (unlikely(err)) {
861 				*bad_wr = wr;
862 				mlx5_dbg(fp, MLX5_DBG_QP_SEND,
863 					 "inline layout failed, err %d\n", err);
864 				goto out;
865 			}
866 			inl = 1;
867 			size += sz;
868 		} else {
869 			dpseg = seg;
870 			for (i = sg_copy_ptr.index; i < wr->num_sge; ++i) {
871 				if (unlikely(dpseg == qend)) {
872 					seg = mlx5_get_send_wqe(qp, 0);
873 					dpseg = seg;
874 				}
875 				if (likely(wr->sg_list[i].length)) {
876 					if (unlikely(wr->opcode ==
877 						   IBV_WR_ATOMIC_CMP_AND_SWP ||
878 						   wr->opcode ==
879 						   IBV_WR_ATOMIC_FETCH_AND_ADD))
880 						set_data_ptr_seg_atomic(dpseg, wr->sg_list + i);
881 					else {
882 						if (unlikely(wr->opcode == IBV_WR_TSO)) {
883 							if (max_tso < wr->sg_list[i].length) {
884 								err = EINVAL;
885 								*bad_wr = wr;
886 								goto out;
887 							}
888 							max_tso -= wr->sg_list[i].length;
889 						}
890 						set_data_ptr_seg(dpseg, wr->sg_list + i,
891 								 sg_copy_ptr.offset);
892 					}
893 					sg_copy_ptr.offset = 0;
894 					++dpseg;
895 					size += sizeof(struct mlx5_wqe_data_seg) / 16;
896 				}
897 			}
898 		}
899 
900 		mlx5_opcode = mlx5_ib_opcode[wr->opcode];
901 		ctrl->opmod_idx_opcode = htobe32(((qp->sq.cur_post & 0xffff) << 8) |
902 					       mlx5_opcode			 |
903 					       (opmod << 24));
904 		ctrl->qpn_ds = htobe32(size | (ibqp->qp_num << 8));
905 
906 		if (unlikely(qp->wq_sig))
907 			ctrl->signature = wq_sig(ctrl);
908 
909 		qp->sq.wrid[idx] = wr->wr_id;
910 		qp->sq.wqe_head[idx] = qp->sq.head + nreq;
911 		qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
912 
913 #ifdef MLX5_DEBUG
914 		if (mlx5_debug_mask & MLX5_DBG_QP_SEND)
915 			dump_wqe(to_mctx(ibqp->context)->dbg_fp, idx, size, qp);
916 #endif
917 	}
918 
919 out:
920 	if (likely(nreq)) {
921 		qp->sq.head += nreq;
922 		qp->fm_cache = next_fence;
923 
924 		/*
925 		 * Make sure that descriptors are written before
926 		 * updating doorbell record and ringing the doorbell
927 		 */
928 		udma_to_device_barrier();
929 		qp->db[MLX5_SND_DBR] = htobe32(qp->sq.cur_post & 0xffff);
930 
931 		/* Make sure that the doorbell write happens before the memcpy
932 		 * to WC memory below */
933 		ctx = to_mctx(ibqp->context);
934 		if (bf->need_lock)
935 			mmio_wc_spinlock(&bf->lock.lock);
936 		else
937 			mmio_wc_start();
938 
939 		if (!ctx->shut_up_bf && nreq == 1 && bf->uuarn &&
940 		    (inl || ctx->prefer_bf) && size > 1 &&
941 		    size <= bf->buf_size / 16)
942 			mlx5_bf_copy(bf->reg + bf->offset, (unsigned long long *)ctrl,
943 				     align(size * 16, 64), qp);
944 		else
945 			mlx5_write64((__be32 *)ctrl, bf->reg + bf->offset,
946 				     &ctx->lock32);
947 
948 		/*
949 		 * use mmio_flush_writes() to ensure write combining buffers are flushed out
950 		 * of the running CPU. This must be carried inside the spinlock.
951 		 * Otherwise, there is a potential race. In the race, CPU A
952 		 * writes doorbell 1, which is waiting in the WC buffer. CPU B
953 		 * writes doorbell 2, and it's write is flushed earlier. Since
954 		 * the mmio_flush_writes is CPU local, this will result in the HCA seeing
955 		 * doorbell 2, followed by doorbell 1.
956 		 * Flush before toggling bf_offset to be latency oriented.
957 		 */
958 		mmio_flush_writes();
959 		bf->offset ^= bf->buf_size;
960 		if (bf->need_lock)
961 			mlx5_spin_unlock(&bf->lock);
962 	}
963 
964 	mlx5_spin_unlock(&qp->sq.lock);
965 
966 	return err;
967 }
968 
/*
 * Public post-send entry point: forwards the work request chain to
 * _mlx5_post_send().
 *
 * When built with MW_DEBUG, a bind request at the head of the chain is
 * sanity-checked first and EINVAL is returned (without touching
 * *bad_wr) if the window is of type 1, if the MR, address or length is
 * missing, or if the MR and the window belong to different PDs.
 */
int mlx5_post_send(struct ibv_qp *ibqp, struct ibv_send_wr *wr,
		   struct ibv_send_wr **bad_wr)
{
#ifdef MW_DEBUG
	if (wr->opcode == IBV_WR_BIND_MW &&
	    (wr->bind_mw.mw->type == IBV_MW_TYPE_1 ||
	     !wr->bind_mw.bind_info.mr ||
	     !wr->bind_mw.bind_info.addr ||
	     !wr->bind_mw.bind_info.length ||
	     wr->bind_mw.bind_info.mr->pd != wr->bind_mw.mw->pd))
		return EINVAL;
#endif

	return _mlx5_post_send(ibqp, wr, bad_wr);
}
989 
990 int mlx5_bind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
991 		 struct ibv_mw_bind *mw_bind)
992 {
993 	struct ibv_mw_bind_info	*bind_info = &mw_bind->bind_info;
994 	struct ibv_send_wr wr = {};
995 	struct ibv_send_wr *bad_wr = NULL;
996 	int ret;
997 
998 	if (!bind_info->mr && (bind_info->addr || bind_info->length)) {
999 		errno = EINVAL;
1000 		return errno;
1001 	}
1002 
1003 	if (bind_info->mw_access_flags & IBV_ACCESS_ZERO_BASED) {
1004 		errno = EINVAL;
1005 		return errno;
1006 	}
1007 
1008 	if (bind_info->mr) {
1009 		if (to_mmr(bind_info->mr)->alloc_flags & IBV_ACCESS_ZERO_BASED) {
1010 			errno = EINVAL;
1011 			return errno;
1012 		}
1013 
1014 		if (mw->pd != bind_info->mr->pd) {
1015 			errno = EPERM;
1016 			return errno;
1017 		}
1018 	}
1019 
1020 	wr.opcode = IBV_WR_BIND_MW;
1021 	wr.next = NULL;
1022 	wr.wr_id = mw_bind->wr_id;
1023 	wr.send_flags = mw_bind->send_flags;
1024 	wr.bind_mw.bind_info = mw_bind->bind_info;
1025 	wr.bind_mw.mw = mw;
1026 	wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey);
1027 
1028 	ret = _mlx5_post_send(qp, &wr, &bad_wr);
1029 	if (ret)
1030 		return ret;
1031 
1032 	mw->rkey = wr.bind_mw.rkey;
1033 
1034 	return 0;
1035 }
1036 
1037 static void set_sig_seg(struct mlx5_qp *qp, struct mlx5_rwqe_sig *sig,
1038 			int size, uint16_t idx)
1039 {
1040 	uint8_t  sign;
1041 	uint32_t qpn = qp->ibv_qp->qp_num;
1042 
1043 	sign = calc_sig(sig, size);
1044 	sign ^= calc_sig(&qpn, 4);
1045 	sign ^= calc_sig(&idx, 2);
1046 	sig->signature = sign;
1047 }
1048 
1049 static void set_wq_sig_seg(struct mlx5_rwq *rwq, struct mlx5_rwqe_sig *sig,
1050 			   int size, uint16_t idx)
1051 {
1052 	uint8_t  sign;
1053 	uint32_t qpn = rwq->wq.wq_num;
1054 
1055 	sign = calc_sig(sig, size);
1056 	sign ^= calc_sig(&qpn, 4);
1057 	sign ^= calc_sig(&idx, 2);
1058 	sig->signature = sign;
1059 }
1060 
1061 int mlx5_post_wq_recv(struct ibv_wq *ibwq, struct ibv_recv_wr *wr,
1062 		      struct ibv_recv_wr **bad_wr)
1063 {
1064 	struct mlx5_rwq *rwq = to_mrwq(ibwq);
1065 	struct mlx5_wqe_data_seg *scat;
1066 	int err = 0;
1067 	int nreq;
1068 	int ind;
1069 	int i, j;
1070 	struct mlx5_rwqe_sig *sig;
1071 
1072 	mlx5_spin_lock(&rwq->rq.lock);
1073 
1074 	ind = rwq->rq.head & (rwq->rq.wqe_cnt - 1);
1075 
1076 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
1077 		if (unlikely(mlx5_wq_overflow(&rwq->rq, nreq,
1078 					      to_mcq(rwq->wq.cq)))) {
1079 			err = ENOMEM;
1080 			*bad_wr = wr;
1081 			goto out;
1082 		}
1083 
1084 		if (unlikely(wr->num_sge > rwq->rq.max_gs)) {
1085 			err = EINVAL;
1086 			*bad_wr = wr;
1087 			goto out;
1088 		}
1089 
1090 		scat = get_wq_recv_wqe(rwq, ind);
1091 		sig = (struct mlx5_rwqe_sig *)scat;
1092 		if (unlikely(rwq->wq_sig)) {
1093 			memset(sig, 0, 1 << rwq->rq.wqe_shift);
1094 			++scat;
1095 		}
1096 
1097 		for (i = 0, j = 0; i < wr->num_sge; ++i) {
1098 			if (unlikely(!wr->sg_list[i].length))
1099 				continue;
1100 			set_data_ptr_seg(scat + j++, wr->sg_list + i, 0);
1101 		}
1102 
1103 		if (j < rwq->rq.max_gs) {
1104 			scat[j].byte_count = 0;
1105 			scat[j].lkey       = htobe32(MLX5_INVALID_LKEY);
1106 			scat[j].addr       = 0;
1107 		}
1108 
1109 		if (unlikely(rwq->wq_sig))
1110 			set_wq_sig_seg(rwq, sig, (wr->num_sge + 1) << 4,
1111 				       rwq->rq.head & 0xffff);
1112 
1113 		rwq->rq.wrid[ind] = wr->wr_id;
1114 
1115 		ind = (ind + 1) & (rwq->rq.wqe_cnt - 1);
1116 	}
1117 
1118 out:
1119 	if (likely(nreq)) {
1120 		rwq->rq.head += nreq;
1121 		/*
1122 		 * Make sure that descriptors are written before
1123 		 * doorbell record.
1124 		 */
1125 		udma_to_device_barrier();
1126 		*(rwq->recv_db) = htobe32(rwq->rq.head & 0xffff);
1127 	}
1128 
1129 	mlx5_spin_unlock(&rwq->rq.lock);
1130 
1131 	return err;
1132 }
1133 
1134 int mlx5_post_recv(struct ibv_qp *ibqp, struct ibv_recv_wr *wr,
1135 		   struct ibv_recv_wr **bad_wr)
1136 {
1137 	struct mlx5_qp *qp = to_mqp(ibqp);
1138 	struct mlx5_wqe_data_seg *scat;
1139 	int err = 0;
1140 	int nreq;
1141 	int ind;
1142 	int i, j;
1143 	struct mlx5_rwqe_sig *sig;
1144 
1145 	mlx5_spin_lock(&qp->rq.lock);
1146 
1147 	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
1148 
1149 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
1150 		if (unlikely(mlx5_wq_overflow(&qp->rq, nreq,
1151 					      to_mcq(qp->ibv_qp->recv_cq)))) {
1152 			err = ENOMEM;
1153 			*bad_wr = wr;
1154 			goto out;
1155 		}
1156 
1157 		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
1158 			err = EINVAL;
1159 			*bad_wr = wr;
1160 			goto out;
1161 		}
1162 
1163 		scat = get_recv_wqe(qp, ind);
1164 		sig = (struct mlx5_rwqe_sig *)scat;
1165 		if (unlikely(qp->wq_sig)) {
1166 			memset(sig, 0, 1 << qp->rq.wqe_shift);
1167 			++scat;
1168 		}
1169 
1170 		for (i = 0, j = 0; i < wr->num_sge; ++i) {
1171 			if (unlikely(!wr->sg_list[i].length))
1172 				continue;
1173 			set_data_ptr_seg(scat + j++, wr->sg_list + i, 0);
1174 		}
1175 
1176 		if (j < qp->rq.max_gs) {
1177 			scat[j].byte_count = 0;
1178 			scat[j].lkey       = htobe32(MLX5_INVALID_LKEY);
1179 			scat[j].addr       = 0;
1180 		}
1181 
1182 		if (unlikely(qp->wq_sig))
1183 			set_sig_seg(qp, sig, (wr->num_sge + 1) << 4,
1184 				    qp->rq.head & 0xffff);
1185 
1186 		qp->rq.wrid[ind] = wr->wr_id;
1187 
1188 		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
1189 	}
1190 
1191 out:
1192 	if (likely(nreq)) {
1193 		qp->rq.head += nreq;
1194 
1195 		/*
1196 		 * Make sure that descriptors are written before
1197 		 * doorbell record.
1198 		 */
1199 		udma_to_device_barrier();
1200 
1201 		/*
1202 		 * For Raw Packet QP, avoid updating the doorbell record
1203 		 * as long as the QP isn't in RTR state, to avoid receiving
1204 		 * packets in illegal states.
1205 		 * This is only for Raw Packet QPs since they are represented
1206 		 * differently in the hardware.
1207 		 */
1208 		if (likely(!(ibqp->qp_type == IBV_QPT_RAW_PACKET &&
1209 			     ibqp->state < IBV_QPS_RTR)))
1210 			qp->db[MLX5_RCV_DBR] = htobe32(qp->rq.head & 0xffff);
1211 	}
1212 
1213 	mlx5_spin_unlock(&qp->rq.lock);
1214 
1215 	return err;
1216 }
1217 
/*
 * Report whether the environment variable named by key is set to
 * exactly "y".
 *
 * Returns 1 in that case and 0 otherwise (variable unset or holding any
 * other value).
 */
int mlx5_use_huge(const char *key)
{
	const char *value = getenv(key);

	return (value != NULL && strcmp(value, "y") == 0) ? 1 : 0;
}
1227 
1228 struct mlx5_qp *mlx5_find_qp(struct mlx5_context *ctx, uint32_t qpn)
1229 {
1230 	int tind = qpn >> MLX5_QP_TABLE_SHIFT;
1231 
1232 	if (ctx->qp_table[tind].refcnt)
1233 		return ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK];
1234 	else
1235 		return NULL;
1236 }
1237 
1238 int mlx5_store_qp(struct mlx5_context *ctx, uint32_t qpn, struct mlx5_qp *qp)
1239 {
1240 	int tind = qpn >> MLX5_QP_TABLE_SHIFT;
1241 
1242 	if (!ctx->qp_table[tind].refcnt) {
1243 		ctx->qp_table[tind].table = calloc(MLX5_QP_TABLE_MASK + 1,
1244 						   sizeof(struct mlx5_qp *));
1245 		if (!ctx->qp_table[tind].table)
1246 			return -1;
1247 	}
1248 
1249 	++ctx->qp_table[tind].refcnt;
1250 	ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK] = qp;
1251 	return 0;
1252 }
1253 
1254 void mlx5_clear_qp(struct mlx5_context *ctx, uint32_t qpn)
1255 {
1256 	int tind = qpn >> MLX5_QP_TABLE_SHIFT;
1257 
1258 	if (!--ctx->qp_table[tind].refcnt)
1259 		free(ctx->qp_table[tind].table);
1260 	else
1261 		ctx->qp_table[tind].table[qpn & MLX5_QP_TABLE_MASK] = NULL;
1262 }
1263