xref: /freebsd/contrib/ofed/libmlx5/verbs.c (revision a0b9e2e854027e6ff61fb075a1309dbc71c42b54)
1 /*
2  * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <config.h>
34 
35 #include <stdlib.h>
36 #include <stdio.h>
37 #include <string.h>
38 #include <pthread.h>
39 #include <errno.h>
40 #include <limits.h>
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <fcntl.h>
44 #include <unistd.h>
45 #include <sys/mman.h>
46 
47 #include "mlx5.h"
48 #include "mlx5-abi.h"
49 #include "wqe.h"
50 
/* When non-zero, the process declared itself single-threaded so locking
 * may be elided.  NOTE(review): assigned elsewhere (presumably from an
 * environment knob during context init) — confirm against mlx5.c. */
int mlx5_single_threaded = 0;
52 
53 static inline int is_xrc_tgt(int type)
54 {
55 	return type == IBV_QPT_XRC_RECV;
56 }
57 
58 int mlx5_query_device(struct ibv_context *context, struct ibv_device_attr *attr)
59 {
60 	struct ibv_query_device cmd;
61 	uint64_t raw_fw_ver;
62 	unsigned major, minor, sub_minor;
63 	int ret;
64 
65 	ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd);
66 	if (ret)
67 		return ret;
68 
69 	major     = (raw_fw_ver >> 32) & 0xffff;
70 	minor     = (raw_fw_ver >> 16) & 0xffff;
71 	sub_minor = raw_fw_ver & 0xffff;
72 
73 	snprintf(attr->fw_ver, sizeof attr->fw_ver,
74 		 "%d.%d.%04d", major, minor, sub_minor);
75 
76 	return 0;
77 }
78 
/* Raw 32-bit read from the mapped core-clock page. */
#define READL(ptr) (*((uint32_t *)(ptr)))
/*
 * Read the 64-bit free-running HCA core clock from its mapped page.
 *
 * The counter is stored big-endian and read as two 32-bit halves, so the
 * high word is sampled before and after the low word to detect a carry
 * between the two reads (at most two attempts).
 *
 * Returns 0 on success, or -EOPNOTSUPP when the clock page was not
 * mapped for this context.
 */
static int mlx5_read_clock(struct ibv_context *context, uint64_t *cycles)
{
	unsigned int clockhi, clocklo, clockhi1;
	int i;
	struct mlx5_context *ctx = to_mctx(context);

	if (!ctx->hca_core_clock)
		return -EOPNOTSUPP;

	/* Handle wraparound */
	for (i = 0; i < 2; i++) {
		clockhi = be32toh(READL(ctx->hca_core_clock));
		clocklo = be32toh(READL(ctx->hca_core_clock + 4));
		clockhi1 = be32toh(READL(ctx->hca_core_clock));
		if (clockhi == clockhi1)
			break;
	}

	/* NOTE(review): if both attempts race with a carry, the last
	 * (possibly inconsistent) sample is still combined — confirm this
	 * is acceptable for callers. */
	*cycles = (uint64_t)clockhi << 32 | (uint64_t)clocklo;

	return 0;
}
102 
103 int mlx5_query_rt_values(struct ibv_context *context,
104 			 struct ibv_values_ex *values)
105 {
106 	uint32_t comp_mask = 0;
107 	int err = 0;
108 
109 	if (values->comp_mask & IBV_VALUES_MASK_RAW_CLOCK) {
110 		uint64_t cycles;
111 
112 		err = mlx5_read_clock(context, &cycles);
113 		if (!err) {
114 			values->raw_clock.tv_sec = 0;
115 			values->raw_clock.tv_nsec = cycles;
116 			comp_mask |= IBV_VALUES_MASK_RAW_CLOCK;
117 		}
118 	}
119 
120 	values->comp_mask = comp_mask;
121 
122 	return err;
123 }
124 
125 int mlx5_query_port(struct ibv_context *context, uint8_t port,
126 		     struct ibv_port_attr *attr)
127 {
128 	struct ibv_query_port cmd;
129 
130 	return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd);
131 }
132 
133 struct ibv_pd *mlx5_alloc_pd(struct ibv_context *context)
134 {
135 	struct ibv_alloc_pd       cmd;
136 	struct mlx5_alloc_pd_resp resp;
137 	struct mlx5_pd		 *pd;
138 
139 	pd = calloc(1, sizeof *pd);
140 	if (!pd)
141 		return NULL;
142 
143 	if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd,
144 			     &resp.ibv_resp, sizeof resp)) {
145 		free(pd);
146 		return NULL;
147 	}
148 
149 	pd->pdn = resp.pdn;
150 
151 	return &pd->ibv_pd;
152 }
153 
/* Destroy a PD; the wrapper is freed only after the kernel succeeds. */
int mlx5_free_pd(struct ibv_pd *pd)
{
	int err = ibv_cmd_dealloc_pd(pd);

	if (err)
		return err;

	free(to_mpd(pd));
	return 0;
}
165 
166 struct ibv_mr *mlx5_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
167 			   int acc)
168 {
169 	struct mlx5_mr *mr;
170 	struct ibv_reg_mr cmd;
171 	int ret;
172 	enum ibv_access_flags access = (enum ibv_access_flags)acc;
173 	struct ibv_reg_mr_resp resp;
174 
175 	mr = calloc(1, sizeof(*mr));
176 	if (!mr)
177 		return NULL;
178 
179 	ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t)addr, access,
180 			     &(mr->ibv_mr), &cmd, sizeof(cmd), &resp,
181 			     sizeof resp);
182 	if (ret) {
183 		mlx5_free_buf(&(mr->buf));
184 		free(mr);
185 		return NULL;
186 	}
187 	mr->alloc_flags = acc;
188 
189 	return &mr->ibv_mr;
190 }
191 
192 int mlx5_rereg_mr(struct ibv_mr *ibmr, int flags, struct ibv_pd *pd, void *addr,
193 		  size_t length, int access)
194 {
195 	struct ibv_rereg_mr cmd;
196 	struct ibv_rereg_mr_resp resp;
197 
198 	if (flags & IBV_REREG_MR_KEEP_VALID)
199 		return ENOTSUP;
200 
201 	return ibv_cmd_rereg_mr(ibmr, flags, addr, length, (uintptr_t)addr,
202 				access, pd, &cmd, sizeof(cmd), &resp,
203 				sizeof(resp));
204 }
205 
/* Deregister an MR and free its wrapper on success. */
int mlx5_dereg_mr(struct ibv_mr *ibmr)
{
	struct mlx5_mr *mr = to_mmr(ibmr);
	int err = ibv_cmd_dereg_mr(ibmr);

	if (err)
		return err;

	free(mr);
	return 0;
}
218 
219 struct ibv_mw *mlx5_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
220 {
221 	struct ibv_mw *mw;
222 	struct ibv_alloc_mw cmd;
223 	struct ibv_alloc_mw_resp resp;
224 	int ret;
225 
226 	mw = malloc(sizeof(*mw));
227 	if (!mw)
228 		return NULL;
229 
230 	memset(mw, 0, sizeof(*mw));
231 
232 	ret = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), &resp,
233 			       sizeof(resp));
234 	if (ret) {
235 		free(mw);
236 		return NULL;
237 	}
238 
239 	return mw;
240 }
241 
242 int mlx5_dealloc_mw(struct ibv_mw *mw)
243 {
244 	int ret;
245 	struct ibv_dealloc_mw cmd;
246 
247 	ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd));
248 	if (ret)
249 		return ret;
250 
251 	free(mw);
252 	return 0;
253 }
254 
/*
 * Round sz up to the next power of two.
 *
 * Returns the rounded value, or -ENOMEM if the result would not fit in
 * an int.  The range is rejected up front so the shift loop can never
 * overflow the signed accumulator (signed overflow is undefined
 * behavior); the behavior for all in-range inputs is unchanged.
 */
int mlx5_round_up_power_of_two(long long sz)
{
	long long ret;

	if (sz > INT_MAX) {
		fprintf(stderr, "%s: roundup overflow\n", __func__);
		return -ENOMEM;
	}

	for (ret = 1; ret < sz; ret <<= 1)
		; /* nothing */

	if (ret > INT_MAX) {
		fprintf(stderr, "%s: roundup overflow\n", __func__);
		return -ENOMEM;
	}

	return (int)ret;
}
269 
/*
 * Queue sizes must be a power of two; delegate to the shared round-up
 * helper (which returns -ENOMEM on int overflow).
 */
static int align_queue_size(long long req)
{
	return mlx5_round_up_power_of_two(req);
}
274 
/*
 * CQE size in bytes: 64 by default, overridable via the MLX5_CQE_SIZE
 * environment variable.  Only 64 and 128 are accepted; any other value
 * yields -EINVAL.
 */
static int get_cqe_size(void)
{
	const char *env = getenv("MLX5_CQE_SIZE");
	int size = 64;

	if (env)
		size = atoi(env);

	if (size != 64 && size != 128)
		return -EINVAL;

	return size;
}
293 
/*
 * Scatter-to-CQE is enabled unless the user explicitly sets
 * MLX5_SCATTER_TO_CQE to "0".
 */
static int use_scatter_to_cqe(void)
{
	const char *env = getenv("MLX5_SCATTER_TO_CQE");

	return (env && strcmp(env, "0") == 0) ? 0 : 1;
}
304 
/* SRQ WQE signatures are enabled iff MLX5_SRQ_SIGNATURE is set (any value). */
static int srq_sig_enabled(void)
{
	return getenv("MLX5_SRQ_SIGNATURE") != NULL;
}
315 
/* QP WQE signatures are enabled iff MLX5_QP_SIGNATURE is set (any value). */
static int qp_sig_enabled(void)
{
	return getenv("MLX5_QP_SIGNATURE") != NULL;
}
326 
/* Work-completion fields the extended-CQ path knows how to deliver. */
enum {
	CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS	|
				       IBV_WC_EX_WITH_COMPLETION_TIMESTAMP |
				       IBV_WC_EX_WITH_CVLAN |
				       IBV_WC_EX_WITH_FLOW_TAG
};

/* comp_mask bits accepted by create_cq(). */
enum {
	CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS
};

/* Creation flags accepted by create_cq(). */
enum {
	CREATE_CQ_SUPPORTED_FLAGS = IBV_CREATE_CQ_ATTR_SINGLE_THREADED
};
341 
/*
 * Common CQ creation path for mlx5_create_cq(), mlx5_create_cq_ex() and
 * mlx5dv_create_cq().
 *
 * Validates the requested attributes against the supported masks/flags,
 * allocates the CQE buffer and doorbell record, optionally enables CQE
 * compression (direct-verbs attribute), then issues the create command.
 *
 * Returns the extended CQ on success, or NULL with errno set; all
 * partially acquired resources are unwound on failure.
 */
static struct ibv_cq_ex *create_cq(struct ibv_context *context,
				   const struct ibv_cq_init_attr_ex *cq_attr,
				   int cq_alloc_flags,
				   struct mlx5dv_cq_init_attr *mlx5cq_attr)
{
	struct mlx5_create_cq		cmd;
	struct mlx5_create_cq_resp	resp;
	struct mlx5_cq		       *cq;
	int				cqe_sz;
	int				ret;
	int				ncqe;
	struct mlx5_context *mctx = to_mctx(context);
	FILE *fp = to_mctx(context)->dbg_fp;

	if (!cq_attr->cqe) {
		mlx5_dbg(fp, MLX5_DBG_CQ, "CQE invalid\n");
		errno = EINVAL;
		return NULL;
	}

	if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) {
		mlx5_dbg(fp, MLX5_DBG_CQ,
			 "Unsupported comp_mask for create_cq\n");
		errno = EINVAL;
		return NULL;
	}

	if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
	    cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) {
		mlx5_dbg(fp, MLX5_DBG_CQ,
			 "Unsupported creation flags requested for create_cq\n");
		errno = EINVAL;
		return NULL;
	}

	if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS) {
		mlx5_dbg(fp, MLX5_DBG_CQ, "\n");
		errno = ENOTSUP;
		return NULL;
	}

	cq =  calloc(1, sizeof *cq);
	if (!cq) {
		mlx5_dbg(fp, MLX5_DBG_CQ, "\n");
		return NULL;
	}

	memset(&cmd, 0, sizeof cmd);
	cq->cons_index = 0;

	if (mlx5_spinlock_init(&cq->lock))
		goto err;

	/* One extra CQE is reserved; the hardware limit is 2^24 entries.
	 * align_queue_size() can return negative on overflow, which the
	 * range check below also rejects. */
	ncqe = align_queue_size(cq_attr->cqe + 1);
	if ((ncqe > (1 << 24)) || (ncqe < (cq_attr->cqe + 1))) {
		mlx5_dbg(fp, MLX5_DBG_CQ, "ncqe %d\n", ncqe);
		errno = EINVAL;
		goto err_spl;
	}

	cqe_sz = get_cqe_size();
	if (cqe_sz < 0) {
		mlx5_dbg(fp, MLX5_DBG_CQ, "\n");
		errno = -cqe_sz;
		goto err_spl;
	}

	if (mlx5_alloc_cq_buf(to_mctx(context), cq, &cq->buf_a, ncqe, cqe_sz)) {
		mlx5_dbg(fp, MLX5_DBG_CQ, "\n");
		goto err_spl;
	}

	cq->dbrec  = mlx5_alloc_dbrec(to_mctx(context));
	if (!cq->dbrec) {
		mlx5_dbg(fp, MLX5_DBG_CQ, "\n");
		goto err_buf;
	}

	/* Reset both doorbell counters before handing the CQ to hardware. */
	cq->dbrec[MLX5_CQ_SET_CI]	= 0;
	cq->dbrec[MLX5_CQ_ARM_DB]	= 0;
	cq->arm_sn			= 0;
	cq->cqe_sz			= cqe_sz;
	cq->flags			= cq_alloc_flags;

	if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
	    cq_attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED)
		cq->flags |= MLX5_CQ_FLAGS_SINGLE_THREADED;
	cmd.buf_addr = (uintptr_t) cq->buf_a.buf;
	cmd.db_addr  = (uintptr_t) cq->dbrec;
	cmd.cqe_size = cqe_sz;

	/* Vendor-specific (direct-verbs) attributes: CQE compression. */
	if (mlx5cq_attr) {
		if (mlx5cq_attr->comp_mask & ~(MLX5DV_CQ_INIT_ATTR_MASK_RESERVED - 1)) {
			mlx5_dbg(fp, MLX5_DBG_CQ,
				   "Unsupported vendor comp_mask for create_cq\n");
			errno = EINVAL;
			goto err_db;
		}

		if (mlx5cq_attr->comp_mask & MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE) {
			if (mctx->cqe_comp_caps.max_num &&
			    (mlx5cq_attr->cqe_comp_res_format &
			     mctx->cqe_comp_caps.supported_format)) {
				cmd.cqe_comp_en = 1;
				cmd.cqe_comp_res_format = mlx5cq_attr->cqe_comp_res_format;
			} else {
				mlx5_dbg(fp, MLX5_DBG_CQ, "CQE Compression is not supported\n");
				errno = EINVAL;
				goto err_db;
			}
		}
	}

	ret = ibv_cmd_create_cq(context, ncqe - 1, cq_attr->channel,
				cq_attr->comp_vector,
				ibv_cq_ex_to_cq(&cq->ibv_cq), &cmd.ibv_cmd,
				sizeof(cmd), &resp.ibv_resp, sizeof(resp));
	if (ret) {
		mlx5_dbg(fp, MLX5_DBG_CQ, "ret %d\n", ret);
		goto err_db;
	}

	cq->active_buf = &cq->buf_a;
	cq->resize_buf = NULL;
	cq->cqn = resp.cqn;
	cq->stall_enable = to_mctx(context)->stall_enable;
	cq->stall_adaptive_enable = to_mctx(context)->stall_adaptive_enable;
	cq->stall_cycles = to_mctx(context)->stall_cycles;

	/* Extended CQs get their poll/read ops wired up here. */
	if (cq_alloc_flags & MLX5_CQ_FLAGS_EXTENDED)
		mlx5_cq_fill_pfns(cq, cq_attr);

	return &cq->ibv_cq;

err_db:
	mlx5_free_db(to_mctx(context), cq->dbrec);

err_buf:
	mlx5_free_cq_buf(to_mctx(context), &cq->buf_a);

err_spl:
	mlx5_spinlock_destroy(&cq->lock);

err:
	free(cq);

	return NULL;
}
490 
491 struct ibv_cq *mlx5_create_cq(struct ibv_context *context, int cqe,
492 			      struct ibv_comp_channel *channel,
493 			      int comp_vector)
494 {
495 	struct ibv_cq_ex *cq;
496 	struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel,
497 						.comp_vector = comp_vector,
498 						.wc_flags = IBV_WC_STANDARD_FLAGS};
499 
500 	if (cqe <= 0) {
501 		errno = EINVAL;
502 		return NULL;
503 	}
504 
505 	cq = create_cq(context, &cq_attr, 0, NULL);
506 	return cq ? ibv_cq_ex_to_cq(cq) : NULL;
507 }
508 
/* Extended CQ creation entry point; no vendor-specific attributes. */
struct ibv_cq_ex *mlx5_create_cq_ex(struct ibv_context *context,
				    struct ibv_cq_init_attr_ex *cq_attr)
{
	return create_cq(context, cq_attr, MLX5_CQ_FLAGS_EXTENDED, NULL);
}
514 
515 struct ibv_cq_ex *mlx5dv_create_cq(struct ibv_context *context,
516 				      struct ibv_cq_init_attr_ex *cq_attr,
517 				      struct mlx5dv_cq_init_attr *mlx5_cq_attr)
518 {
519 	struct ibv_cq_ex *cq;
520 
521 	cq = create_cq(context, cq_attr, MLX5_CQ_FLAGS_EXTENDED, mlx5_cq_attr);
522 	if (!cq)
523 		return NULL;
524 
525 	verbs_init_cq(ibv_cq_ex_to_cq(cq), context,
526 		      cq_attr->channel, cq_attr->cq_context);
527 	return cq;
528 }
529 
/*
 * Resize a CQ in place.
 *
 * A second CQE buffer (buf_a/buf_b ping-pong) is allocated, the kernel is
 * asked to switch the CQ over, and any completions that landed in the old
 * buffer during the switch are copied across.  The whole sequence runs
 * under the CQ lock so pollers never observe a half-switched CQ.
 *
 * Returns 0 on success or a positive/negative error per the original
 * conventions (errno is also set on some paths).
 */
int mlx5_resize_cq(struct ibv_cq *ibcq, int cqe)
{
	struct mlx5_cq *cq = to_mcq(ibcq);
	struct mlx5_resize_cq_resp resp;
	struct mlx5_resize_cq cmd;
	struct mlx5_context *mctx = to_mctx(ibcq->context);
	int err;

	if (cqe < 0) {
		errno = EINVAL;
		return errno;
	}

	memset(&cmd, 0, sizeof(cmd));
	memset(&resp, 0, sizeof(resp));

	/* Reject sizes whose byte count (64B CQEs) would overflow int. */
	if (((long long)cqe * 64) > INT_MAX)
		return EINVAL;

	mlx5_spin_lock(&cq->lock);
	cq->active_cqes = cq->ibv_cq.cqe;
	/* Resize into whichever of the two buffers is not active. */
	if (cq->active_buf == &cq->buf_a)
		cq->resize_buf = &cq->buf_b;
	else
		cq->resize_buf = &cq->buf_a;

	cqe = align_queue_size(cqe + 1);
	/* Same power-of-two size as today: nothing to do. */
	if (cqe == ibcq->cqe + 1) {
		cq->resize_buf = NULL;
		err = 0;
		goto out;
	}

	/* currently we don't change cqe size */
	cq->resize_cqe_sz = cq->cqe_sz;
	cq->resize_cqes = cqe;
	err = mlx5_alloc_cq_buf(mctx, cq, cq->resize_buf, cq->resize_cqes, cq->resize_cqe_sz);
	if (err) {
		cq->resize_buf = NULL;
		errno = ENOMEM;
		goto out;
	}

	cmd.buf_addr = (uintptr_t)cq->resize_buf->buf;
	cmd.cqe_size = cq->resize_cqe_sz;

	err = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof(cmd),
				&resp.ibv_resp, sizeof(resp));
	if (err)
		goto out_buf;

	/* Carry over completions that arrived while the kernel switched. */
	mlx5_cq_resize_copy_cqes(cq);
	mlx5_free_cq_buf(mctx, cq->active_buf);
	cq->active_buf = cq->resize_buf;
	cq->ibv_cq.cqe = cqe - 1;
	mlx5_spin_unlock(&cq->lock);
	/* NOTE(review): resize_buf is cleared after the unlock here —
	 * confirm no concurrent reader depends on it being NULL under
	 * the lock. */
	cq->resize_buf = NULL;
	return 0;

out_buf:
	mlx5_free_cq_buf(mctx, cq->resize_buf);
	cq->resize_buf = NULL;

out:
	mlx5_spin_unlock(&cq->lock);
	return err;
}
597 
598 int mlx5_destroy_cq(struct ibv_cq *cq)
599 {
600 	int ret;
601 
602 	ret = ibv_cmd_destroy_cq(cq);
603 	if (ret)
604 		return ret;
605 
606 	mlx5_free_db(to_mctx(cq->context), to_mcq(cq)->dbrec);
607 	mlx5_free_cq_buf(to_mctx(cq->context), to_mcq(cq)->active_buf);
608 	free(to_mcq(cq));
609 
610 	return 0;
611 }
612 
613 struct ibv_srq *mlx5_create_srq(struct ibv_pd *pd,
614 				struct ibv_srq_init_attr *attr)
615 {
616 	struct mlx5_create_srq      cmd;
617 	struct mlx5_create_srq_resp resp;
618 	struct mlx5_srq		   *srq;
619 	int			    ret;
620 	struct mlx5_context	   *ctx;
621 	int			    max_sge;
622 	struct ibv_srq		   *ibsrq;
623 
624 	ctx = to_mctx(pd->context);
625 	srq = calloc(1, sizeof *srq);
626 	if (!srq) {
627 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
628 		return NULL;
629 	}
630 	ibsrq = &srq->vsrq.srq;
631 
632 	memset(&cmd, 0, sizeof cmd);
633 	if (mlx5_spinlock_init(&srq->lock)) {
634 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
635 		goto err;
636 	}
637 
638 	if (attr->attr.max_wr > ctx->max_srq_recv_wr) {
639 		fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n", __func__, __LINE__,
640 			attr->attr.max_wr, ctx->max_srq_recv_wr);
641 		errno = EINVAL;
642 		goto err;
643 	}
644 
645 	/*
646 	 * this calculation does not consider required control segments. The
647 	 * final calculation is done again later. This is done so to avoid
648 	 * overflows of variables
649 	 */
650 	max_sge = ctx->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg);
651 	if (attr->attr.max_sge > max_sge) {
652 		fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n", __func__, __LINE__,
653 			attr->attr.max_wr, ctx->max_srq_recv_wr);
654 		errno = EINVAL;
655 		goto err;
656 	}
657 
658 	srq->max     = align_queue_size(attr->attr.max_wr + 1);
659 	srq->max_gs  = attr->attr.max_sge;
660 	srq->counter = 0;
661 
662 	if (mlx5_alloc_srq_buf(pd->context, srq)) {
663 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
664 		goto err;
665 	}
666 
667 	srq->db = mlx5_alloc_dbrec(to_mctx(pd->context));
668 	if (!srq->db) {
669 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
670 		goto err_free;
671 	}
672 
673 	*srq->db = 0;
674 
675 	cmd.buf_addr = (uintptr_t) srq->buf.buf;
676 	cmd.db_addr  = (uintptr_t) srq->db;
677 	srq->wq_sig = srq_sig_enabled();
678 	if (srq->wq_sig)
679 		cmd.flags = MLX5_SRQ_FLAG_SIGNATURE;
680 
681 	attr->attr.max_sge = srq->max_gs;
682 	pthread_mutex_lock(&ctx->srq_table_mutex);
683 	ret = ibv_cmd_create_srq(pd, ibsrq, attr, &cmd.ibv_cmd, sizeof(cmd),
684 				 &resp.ibv_resp, sizeof(resp));
685 	if (ret)
686 		goto err_db;
687 
688 	ret = mlx5_store_srq(ctx, resp.srqn, srq);
689 	if (ret)
690 		goto err_destroy;
691 
692 	pthread_mutex_unlock(&ctx->srq_table_mutex);
693 
694 	srq->srqn = resp.srqn;
695 	srq->rsc.rsn = resp.srqn;
696 	srq->rsc.type = MLX5_RSC_TYPE_SRQ;
697 
698 	return ibsrq;
699 
700 err_destroy:
701 	ibv_cmd_destroy_srq(ibsrq);
702 
703 err_db:
704 	pthread_mutex_unlock(&ctx->srq_table_mutex);
705 	mlx5_free_db(to_mctx(pd->context), srq->db);
706 
707 err_free:
708 	free(srq->wrid);
709 	mlx5_free_buf(&srq->buf);
710 
711 err:
712 	free(srq);
713 
714 	return NULL;
715 }
716 
717 int mlx5_modify_srq(struct ibv_srq *srq,
718 		    struct ibv_srq_attr *attr,
719 		    int attr_mask)
720 {
721 	struct ibv_modify_srq cmd;
722 
723 	return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd);
724 }
725 
726 int mlx5_query_srq(struct ibv_srq *srq,
727 		    struct ibv_srq_attr *attr)
728 {
729 	struct ibv_query_srq cmd;
730 
731 	return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
732 }
733 
734 int mlx5_destroy_srq(struct ibv_srq *srq)
735 {
736 	int ret;
737 	struct mlx5_srq *msrq = to_msrq(srq);
738 	struct mlx5_context *ctx = to_mctx(srq->context);
739 
740 	ret = ibv_cmd_destroy_srq(srq);
741 	if (ret)
742 		return ret;
743 
744 	if (ctx->cqe_version && msrq->rsc.type == MLX5_RSC_TYPE_XSRQ)
745 		mlx5_clear_uidx(ctx, msrq->rsc.rsn);
746 	else
747 		mlx5_clear_srq(ctx, msrq->srqn);
748 
749 	mlx5_free_db(ctx, msrq->db);
750 	mlx5_free_buf(&msrq->buf);
751 	free(msrq->wrid);
752 	free(msrq);
753 
754 	return 0;
755 }
756 
/*
 * Fixed per-WQE overhead in bytes (control / addressing / window-bind
 * segments) for a send WQE of the given QP type, excluding the data
 * segments.  Returns -EINVAL for QP types this provider cannot build.
 */
static int sq_overhead(enum ibv_qp_type	qp_type)
{
	size_t size = 0;
	/* Worst-case size of an inline memory-window bind. */
	size_t mw_bind_size =
	    sizeof(struct mlx5_wqe_umr_ctrl_seg) +
	    sizeof(struct mlx5_wqe_mkey_context_seg) +
	    max_t(size_t, sizeof(struct mlx5_wqe_umr_klm_seg), 64);

	switch (qp_type) {
	case IBV_QPT_RC:
		size += sizeof(struct mlx5_wqe_ctrl_seg) +
			max(sizeof(struct mlx5_wqe_atomic_seg) +
			    sizeof(struct mlx5_wqe_raddr_seg),
			    mw_bind_size);
		break;

	case IBV_QPT_UC:
		size = sizeof(struct mlx5_wqe_ctrl_seg) +
			max(sizeof(struct mlx5_wqe_raddr_seg),
			    mw_bind_size);
		break;

	case IBV_QPT_UD:
		size = sizeof(struct mlx5_wqe_ctrl_seg) +
			sizeof(struct mlx5_wqe_datagram_seg);
		break;

	case IBV_QPT_XRC_SEND:
		/* XRC_SEND takes the larger of its own layout and the
		 * XRC_RECV layout computed by the deliberate fallthrough. */
		size = sizeof(struct mlx5_wqe_ctrl_seg) + mw_bind_size;
		SWITCH_FALLTHROUGH;

	case IBV_QPT_XRC_RECV:
		size = max(size, sizeof(struct mlx5_wqe_ctrl_seg) +
			   sizeof(struct mlx5_wqe_xrc_seg) +
			   sizeof(struct mlx5_wqe_raddr_seg));
		break;

	case IBV_QPT_RAW_PACKET:
		size = sizeof(struct mlx5_wqe_ctrl_seg) +
			sizeof(struct mlx5_wqe_eth_seg);
		break;

	default:
		return -EINVAL;
	}

	return size;
}
805 
/*
 * Size in bytes of a single send WQE for the requested capabilities:
 * type-specific overhead, optional inline-data segment, optional TSO
 * header, and the scatter/gather entries — rounded up to a whole number
 * of send WQE basic blocks.
 *
 * Returns the WQE size, or a negative errno when the request exceeds the
 * device's max SQ descriptor size.
 */
static int mlx5_calc_send_wqe(struct mlx5_context *ctx,
			      struct ibv_qp_init_attr_ex *attr,
			      struct mlx5_qp *qp)
{
	int size;
	int inl_size = 0;
	int max_gather;
	int tot_size;

	size = sq_overhead(attr->qp_type);
	if (size < 0)
		return size;

	/* Inline data replaces the gather list, so it is sized separately
	 * and the larger of the two layouts wins below. */
	if (attr->cap.max_inline_data) {
		inl_size = size + align(sizeof(struct mlx5_wqe_inl_data_seg) +
			attr->cap.max_inline_data, 16);
	}

	if (attr->comp_mask & IBV_QP_INIT_ATTR_MAX_TSO_HEADER) {
		size += align(attr->max_tso_header, 16);
		qp->max_tso_header = attr->max_tso_header;
	}

	max_gather = (ctx->max_sq_desc_sz - size) /
		sizeof(struct mlx5_wqe_data_seg);
	if (attr->cap.max_send_sge > max_gather)
		return -EINVAL;

	size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg);
	tot_size = max_int(size, inl_size);

	if (tot_size > ctx->max_sq_desc_sz)
		return -EINVAL;

	return align(tot_size, MLX5_SEND_WQE_BB);
}
842 
/*
 * Size in bytes of a single receive WQE (scatter entries plus optional
 * signature), rounded up to a power of two.  A QP attached to an SRQ has
 * no RQ of its own, so its receive WQE size is 0.
 *
 * Returns the WQE size or -EINVAL when it exceeds the device's max RQ
 * descriptor size.
 */
static int mlx5_calc_rcv_wqe(struct mlx5_context *ctx,
			     struct ibv_qp_init_attr_ex *attr,
			     struct mlx5_qp *qp)
{
	uint32_t size;
	int num_scatter;

	if (attr->srq)
		return 0;

	/* At least one scatter entry even if the caller asked for zero. */
	num_scatter = max_t(uint32_t, attr->cap.max_recv_sge, 1);
	size = sizeof(struct mlx5_wqe_data_seg) * num_scatter;
	if (qp->wq_sig)
		size += sizeof(struct mlx5_rwqe_sig);

	if (size > ctx->max_rq_desc_sz)
		return -EINVAL;

	size = mlx5_round_up_power_of_two(size);

	return size;
}
865 
/*
 * Compute the total send-queue buffer size for a QP and fill in the SQ
 * bookkeeping fields (wqe_cnt, wqe_shift, max_gs, max_post).  Also
 * derives the achievable max_inline_data and reports it back through
 * attr.
 *
 * Returns the SQ byte size (0 when no send WQEs were requested) or a
 * negative errno.
 */
static int mlx5_calc_sq_size(struct mlx5_context *ctx,
			     struct ibv_qp_init_attr_ex *attr,
			     struct mlx5_qp *qp)
{
	int wqe_size;
	int wq_size;
	FILE *fp = ctx->dbg_fp;

	if (!attr->cap.max_send_wr)
		return 0;

	wqe_size = mlx5_calc_send_wqe(ctx, attr, qp);
	if (wqe_size < 0) {
		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
		return wqe_size;
	}

	if (wqe_size > ctx->max_sq_desc_sz) {
		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
		return -EINVAL;
	}

	/* sq_overhead() cannot fail here: mlx5_calc_send_wqe() above
	 * already validated the QP type. */
	qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) -
		sizeof(struct mlx5_wqe_inl_data_seg);
	attr->cap.max_inline_data = qp->max_inline_data;

	/*
	 * to avoid overflow, we limit max_send_wr so
	 * that the multiplication will fit in int
	 */
	if (attr->cap.max_send_wr > 0x7fffffff / ctx->max_sq_desc_sz) {
		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
		return -EINVAL;
	}

	wq_size = mlx5_round_up_power_of_two(attr->cap.max_send_wr * wqe_size);
	qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
	if (qp->sq.wqe_cnt > ctx->max_send_wqebb) {
		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
		return -EINVAL;
	}

	qp->sq.wqe_shift = mlx5_ilog2(MLX5_SEND_WQE_BB);
	qp->sq.max_gs = attr->cap.max_send_sge;
	/* max_post counts whole WQEs of the computed size, not BBs. */
	qp->sq.max_post = wq_size / wqe_size;

	return wq_size;
}
914 
/*
 * Compute the buffer size for a standalone receive work queue (ibv_wq)
 * and fill in rwq->rq bookkeeping (wqe_cnt, wqe_shift, max_post, max_gs).
 *
 * Returns the WQ byte size or -EINVAL on invalid/oversized requests.
 */
static int mlx5_calc_rwq_size(struct mlx5_context *ctx,
			      struct mlx5_rwq *rwq,
			      struct ibv_wq_init_attr *attr)
{
	size_t wqe_size;
	int wq_size;
	uint32_t num_scatter;
	int scat_spc;

	if (!attr->max_wr)
		return -EINVAL;

	/* TBD: check caps for RQ */
	num_scatter = max_t(uint32_t, attr->max_sge, 1);
	wqe_size = sizeof(struct mlx5_wqe_data_seg) * num_scatter;

	if (rwq->wq_sig)
		wqe_size += sizeof(struct mlx5_rwqe_sig);

	/* NOTE(review): wqe_size is size_t (unsigned), so "<= 0" only
	 * catches 0 — the negative half of the test is dead. */
	if (wqe_size <= 0 || wqe_size > ctx->max_rq_desc_sz)
		return -EINVAL;

	wqe_size = mlx5_round_up_power_of_two(wqe_size);
	wq_size = mlx5_round_up_power_of_two(attr->max_wr) * wqe_size;
	wq_size = max(wq_size, MLX5_SEND_WQE_BB);
	rwq->rq.wqe_cnt = wq_size / wqe_size;
	rwq->rq.wqe_shift = mlx5_ilog2(wqe_size);
	rwq->rq.max_post = 1 << mlx5_ilog2(wq_size / wqe_size);
	/* Space left for scatter entries after the optional signature. */
	scat_spc = wqe_size -
		((rwq->wq_sig) ? sizeof(struct mlx5_rwqe_sig) : 0);
	rwq->rq.max_gs = scat_spc / sizeof(struct mlx5_wqe_data_seg);
	return wq_size;
}
948 
/*
 * Compute the receive-queue buffer size for a QP and fill in the RQ
 * bookkeeping fields.  Returns the RQ byte size (0 when no receive WQEs
 * were requested, e.g. SRQ-attached QPs) or -EINVAL.
 */
static int mlx5_calc_rq_size(struct mlx5_context *ctx,
			     struct ibv_qp_init_attr_ex *attr,
			     struct mlx5_qp *qp)
{
	int wqe_size;
	int wq_size;
	int scat_spc;
	FILE *fp = ctx->dbg_fp;

	if (!attr->cap.max_recv_wr)
		return 0;

	if (attr->cap.max_recv_wr > ctx->max_recv_wr) {
		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
		return -EINVAL;
	}

	wqe_size = mlx5_calc_rcv_wqe(ctx, attr, qp);
	if (wqe_size < 0 || wqe_size > ctx->max_rq_desc_sz) {
		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
		return -EINVAL;
	}

	wq_size = mlx5_round_up_power_of_two(attr->cap.max_recv_wr) * wqe_size;
	/* wqe_size == 0 means an SRQ is used: leave the RQ fields zeroed. */
	if (wqe_size) {
		wq_size = max(wq_size, MLX5_SEND_WQE_BB);
		qp->rq.wqe_cnt = wq_size / wqe_size;
		qp->rq.wqe_shift = mlx5_ilog2(wqe_size);
		qp->rq.max_post = 1 << mlx5_ilog2(wq_size / wqe_size);
		/* Space left for scatter entries after the optional signature. */
		scat_spc = wqe_size -
			(qp->wq_sig ? sizeof(struct mlx5_rwqe_sig) : 0);
		qp->rq.max_gs = scat_spc / sizeof(struct mlx5_wqe_data_seg);
	} else {
		qp->rq.wqe_cnt = 0;
		qp->rq.wqe_shift = 0;
		qp->rq.max_post = 0;
		qp->rq.max_gs = 0;
	}
	return wq_size;
}
989 
990 static int mlx5_calc_wq_size(struct mlx5_context *ctx,
991 			     struct ibv_qp_init_attr_ex *attr,
992 			     struct mlx5_qp *qp)
993 {
994 	int ret;
995 	int result;
996 
997 	ret = mlx5_calc_sq_size(ctx, attr, qp);
998 	if (ret < 0)
999 		return ret;
1000 
1001 	result = ret;
1002 	ret = mlx5_calc_rq_size(ctx, attr, qp);
1003 	if (ret < 0)
1004 		return ret;
1005 
1006 	result += ret;
1007 
1008 	qp->sq.offset = ret;
1009 	qp->rq.offset = 0;
1010 
1011 	return result;
1012 }
1013 
1014 static void map_uuar(struct ibv_context *context, struct mlx5_qp *qp,
1015 		     int uuar_index)
1016 {
1017 	struct mlx5_context *ctx = to_mctx(context);
1018 
1019 	qp->bf = &ctx->bfs[uuar_index];
1020 }
1021 
1022 static const char *qptype2key(enum ibv_qp_type type)
1023 {
1024 	switch (type) {
1025 	case IBV_QPT_RC: return "HUGE_RC";
1026 	case IBV_QPT_UC: return "HUGE_UC";
1027 	case IBV_QPT_UD: return "HUGE_UD";
1028 	case IBV_QPT_RAW_PACKET: return "HUGE_RAW_ETH";
1029 	default: return "HUGE_NA";
1030 	}
1031 }
1032 
1033 static int mlx5_alloc_qp_buf(struct ibv_context *context,
1034 			     struct ibv_qp_init_attr_ex *attr,
1035 			     struct mlx5_qp *qp,
1036 			     int size)
1037 {
1038 	int err;
1039 	enum mlx5_alloc_type alloc_type;
1040 	enum mlx5_alloc_type default_alloc_type = MLX5_ALLOC_TYPE_ANON;
1041 	const char *qp_huge_key;
1042 
1043 	if (qp->sq.wqe_cnt) {
1044 		qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid));
1045 		if (!qp->sq.wrid) {
1046 			errno = ENOMEM;
1047 			err = -1;
1048 			return err;
1049 		}
1050 
1051 		qp->sq.wr_data = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data));
1052 		if (!qp->sq.wr_data) {
1053 			errno = ENOMEM;
1054 			err = -1;
1055 			goto ex_wrid;
1056 		}
1057 	}
1058 
1059 	qp->sq.wqe_head = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head));
1060 	if (!qp->sq.wqe_head) {
1061 		errno = ENOMEM;
1062 		err = -1;
1063 			goto ex_wrid;
1064 	}
1065 
1066 	if (qp->rq.wqe_cnt) {
1067 		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(uint64_t));
1068 		if (!qp->rq.wrid) {
1069 			errno = ENOMEM;
1070 			err = -1;
1071 			goto ex_wrid;
1072 		}
1073 	}
1074 
1075 	/* compatibility support */
1076 	qp_huge_key  = qptype2key(qp->ibv_qp->qp_type);
1077 	if (mlx5_use_huge(qp_huge_key))
1078 		default_alloc_type = MLX5_ALLOC_TYPE_HUGE;
1079 
1080 	mlx5_get_alloc_type(MLX5_QP_PREFIX, &alloc_type,
1081 			    default_alloc_type);
1082 
1083 	err = mlx5_alloc_prefered_buf(to_mctx(context), &qp->buf,
1084 				      align(qp->buf_size, to_mdev
1085 				      (context->device)->page_size),
1086 				      to_mdev(context->device)->page_size,
1087 				      alloc_type,
1088 				      MLX5_QP_PREFIX);
1089 
1090 	if (err) {
1091 		err = -ENOMEM;
1092 		goto ex_wrid;
1093 	}
1094 
1095 	memset(qp->buf.buf, 0, qp->buf_size);
1096 
1097 	if (attr->qp_type == IBV_QPT_RAW_PACKET) {
1098 		size_t aligned_sq_buf_size = align(qp->sq_buf_size,
1099 						   to_mdev(context->device)->page_size);
1100 		/* For Raw Packet QP, allocate a separate buffer for the SQ */
1101 		err = mlx5_alloc_prefered_buf(to_mctx(context), &qp->sq_buf,
1102 					      aligned_sq_buf_size,
1103 					      to_mdev(context->device)->page_size,
1104 					      alloc_type,
1105 					      MLX5_QP_PREFIX);
1106 		if (err) {
1107 			err = -ENOMEM;
1108 			goto rq_buf;
1109 		}
1110 
1111 		memset(qp->sq_buf.buf, 0, aligned_sq_buf_size);
1112 	}
1113 
1114 	return 0;
1115 rq_buf:
1116 	mlx5_free_actual_buf(to_mctx(qp->verbs_qp.qp.context), &qp->buf);
1117 ex_wrid:
1118 	if (qp->rq.wrid)
1119 		free(qp->rq.wrid);
1120 
1121 	if (qp->sq.wqe_head)
1122 		free(qp->sq.wqe_head);
1123 
1124 	if (qp->sq.wr_data)
1125 		free(qp->sq.wr_data);
1126 	if (qp->sq.wrid)
1127 		free(qp->sq.wrid);
1128 
1129 	return err;
1130 }
1131 
1132 static void mlx5_free_qp_buf(struct mlx5_qp *qp)
1133 {
1134 	struct mlx5_context *ctx = to_mctx(qp->ibv_qp->context);
1135 
1136 	mlx5_free_actual_buf(ctx, &qp->buf);
1137 
1138 	if (qp->sq_buf.buf)
1139 		mlx5_free_actual_buf(ctx, &qp->sq_buf);
1140 
1141 	if (qp->rq.wrid)
1142 		free(qp->rq.wrid);
1143 
1144 	if (qp->sq.wqe_head)
1145 		free(qp->sq.wqe_head);
1146 
1147 	if (qp->sq.wrid)
1148 		free(qp->sq.wrid);
1149 
1150 	if (qp->sq.wr_data)
1151 		free(qp->sq.wr_data);
1152 }
1153 
/*
 * Create an RSS (RX-hash) QP via the extended create-QP command.
 *
 * Copies the RX-hash configuration (fields mask, hash function, key)
 * into the vendor command after bounding the key length to the command
 * buffer.  On success marks the QP as an RSS QP.  Returns 0 or an error
 * (errno is set for an oversized key).
 */
static int mlx5_cmd_create_rss_qp(struct ibv_context *context,
				 struct ibv_qp_init_attr_ex *attr,
				 struct mlx5_qp *qp)
{
	struct mlx5_create_qp_ex_rss cmd_ex_rss = {};
	struct mlx5_create_qp_resp_ex resp = {};
	int ret;

	if (attr->rx_hash_conf.rx_hash_key_len > sizeof(cmd_ex_rss.rx_hash_key)) {
		errno = EINVAL;
		return errno;
	}

	cmd_ex_rss.rx_hash_fields_mask = attr->rx_hash_conf.rx_hash_fields_mask;
	cmd_ex_rss.rx_hash_function = attr->rx_hash_conf.rx_hash_function;
	cmd_ex_rss.rx_key_len = attr->rx_hash_conf.rx_hash_key_len;
	memcpy(cmd_ex_rss.rx_hash_key, attr->rx_hash_conf.rx_hash_key,
			attr->rx_hash_conf.rx_hash_key_len);

	ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp,
					    sizeof(qp->verbs_qp), attr,
					    &cmd_ex_rss.ibv_cmd, sizeof(cmd_ex_rss.ibv_cmd),
					    sizeof(cmd_ex_rss), &resp.ibv_resp,
					    sizeof(resp.ibv_resp), sizeof(resp));
	if (ret)
		return ret;

	qp->rss_qp = 1;
	return 0;
}
1184 
/*
 * Issue the extended create-QP command by repacking the legacy command
 * layout into the extended one.
 *
 * The two memcpy()s below are deliberate offset arithmetic: the first
 * copies the generic ibv command fields from user_handle through is_srq
 * into the extended base, the second copies the mlx5 driver payload
 * (buf_addr through sq_buf_addr) into drv_ex.  They rely on the struct
 * layouts in mlx5-abi.h staying in sync — do not reorder those structs.
 */
static int mlx5_cmd_create_qp_ex(struct ibv_context *context,
				 struct ibv_qp_init_attr_ex *attr,
				 struct mlx5_create_qp *cmd,
				 struct mlx5_qp *qp,
				 struct mlx5_create_qp_resp_ex *resp)
{
	struct mlx5_create_qp_ex cmd_ex;
	int ret;

	memset(&cmd_ex, 0, sizeof(cmd_ex));
	memcpy(&cmd_ex.ibv_cmd.base, &cmd->ibv_cmd.user_handle,
	       offsetof(typeof(cmd->ibv_cmd), is_srq) +
	       sizeof(cmd->ibv_cmd.is_srq) -
	       offsetof(typeof(cmd->ibv_cmd), user_handle));

	memcpy(&cmd_ex.drv_ex, &cmd->buf_addr,
	       offsetof(typeof(*cmd), sq_buf_addr) +
	       sizeof(cmd->sq_buf_addr) - sizeof(cmd->ibv_cmd));

	ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp,
				    sizeof(qp->verbs_qp), attr,
				    &cmd_ex.ibv_cmd, sizeof(cmd_ex.ibv_cmd),
				    sizeof(cmd_ex), &resp->ibv_resp,
				    sizeof(resp->ibv_resp), sizeof(*resp));

	return ret;
}
1212 
/* All attr->comp_mask bits that create_qp() implements; anything else
 * is rejected up front. */
enum {
	MLX5_CREATE_QP_SUP_COMP_MASK = (IBV_QP_INIT_ATTR_PD |
					IBV_QP_INIT_ATTR_XRCD |
					IBV_QP_INIT_ATTR_CREATE_FLAGS |
					IBV_QP_INIT_ATTR_MAX_TSO_HEADER |
					IBV_QP_INIT_ATTR_IND_TABLE |
					IBV_QP_INIT_ATTR_RX_HASH),
};
1221 
/* comp_mask bits that force create_qp() through the extended
 * create_qp_ex2 command path (mlx5_cmd_create_qp_ex). */
enum {
	MLX5_CREATE_QP_EX2_COMP_MASK = (IBV_QP_INIT_ATTR_CREATE_FLAGS |
					IBV_QP_INIT_ATTR_MAX_TSO_HEADER |
					IBV_QP_INIT_ATTR_IND_TABLE |
					IBV_QP_INIT_ATTR_RX_HASH),
};
1228 
/*
 * Common QP creation path for both mlx5_create_qp() and
 * mlx5_create_qp_ex().  Allocates the WQ buffer(s) and doorbell record,
 * issues the kernel command, registers the QP for CQE lookup and maps
 * its UAR.  Returns NULL on any failure (errno may be set).
 */
static struct ibv_qp *create_qp(struct ibv_context *context,
			 struct ibv_qp_init_attr_ex *attr)
{
	struct mlx5_create_qp		cmd;
	struct mlx5_create_qp_resp	resp;
	struct mlx5_create_qp_resp_ex resp_ex;
	struct mlx5_qp		       *qp;
	int				ret;
	struct mlx5_context	       *ctx = to_mctx(context);
	struct ibv_qp		       *ibqp;
	int32_t				usr_idx = 0;
	uint32_t			uuar_index;
	FILE *fp = ctx->dbg_fp;

	/* Reject comp_mask bits this driver does not implement. */
	if (attr->comp_mask & ~MLX5_CREATE_QP_SUP_COMP_MASK)
		return NULL;

	/* A TSO header size is only meaningful for raw packet QPs. */
	if ((attr->comp_mask & IBV_QP_INIT_ATTR_MAX_TSO_HEADER) &&
	    (attr->qp_type != IBV_QPT_RAW_PACKET))
		return NULL;

	qp = calloc(1, sizeof(*qp));
	if (!qp) {
		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
		return NULL;
	}
	ibqp = (struct ibv_qp *)&qp->verbs_qp;
	qp->ibv_qp = ibqp;

	memset(&cmd, 0, sizeof(cmd));
	memset(&resp, 0, sizeof(resp));
	memset(&resp_ex, 0, sizeof(resp_ex));

	/* RSS QPs take a dedicated, much simpler path: no WQ buffers,
	 * doorbell record or UAR mapping are needed. */
	if (attr->comp_mask & IBV_QP_INIT_ATTR_RX_HASH) {
		ret = mlx5_cmd_create_rss_qp(context, attr, qp);
		if (ret)
			goto err;

		return ibqp;
	}

	qp->wq_sig = qp_sig_enabled();
	if (qp->wq_sig)
		cmd.flags |= MLX5_QP_FLAG_SIGNATURE;

	if (use_scatter_to_cqe())
		cmd.flags |= MLX5_QP_FLAG_SCATTER_CQE;

	/* Returns the total buffer size needed, or a negative errno. */
	ret = mlx5_calc_wq_size(ctx, attr, qp);
	if (ret < 0) {
		errno = -ret;
		goto err;
	}

	/* Raw packet QPs keep the SQ in a separate buffer (sq_buf);
	 * other QP types share one buffer between RQ and SQ. */
	if (attr->qp_type == IBV_QPT_RAW_PACKET) {
		qp->buf_size = qp->sq.offset;
		qp->sq_buf_size = ret - qp->buf_size;
		qp->sq.offset = 0;
	} else {
		qp->buf_size = ret;
		qp->sq_buf_size = 0;
	}

	if (mlx5_alloc_qp_buf(context, attr, qp, ret)) {
		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
		goto err;
	}

	if (attr->qp_type == IBV_QPT_RAW_PACKET) {
		qp->sq_start = qp->sq_buf.buf;
		qp->sq.qend = qp->sq_buf.buf +
				(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	} else {
		qp->sq_start = qp->buf.buf + qp->sq.offset;
		qp->sq.qend = qp->buf.buf + qp->sq.offset +
				(qp->sq.wqe_cnt << qp->sq.wqe_shift);
	}

	mlx5_init_qp_indices(qp);

	if (mlx5_spinlock_init(&qp->sq.lock) ||
	    mlx5_spinlock_init(&qp->rq.lock))
		goto err_free_qp_buf;

	qp->db = mlx5_alloc_dbrec(ctx);
	if (!qp->db) {
		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
		goto err_free_qp_buf;
	}

	qp->db[MLX5_RCV_DBR] = 0;
	qp->db[MLX5_SND_DBR] = 0;

	cmd.buf_addr = (uintptr_t) qp->buf.buf;
	cmd.sq_buf_addr = (attr->qp_type == IBV_QPT_RAW_PACKET) ?
			  (uintptr_t) qp->sq_buf.buf : 0;
	cmd.db_addr  = (uintptr_t) qp->db;
	cmd.sq_wqe_count = qp->sq.wqe_cnt;
	cmd.rq_wqe_count = qp->rq.wqe_cnt;
	cmd.rq_wqe_shift = qp->rq.wqe_shift;

	if (ctx->atomic_cap == IBV_ATOMIC_HCA)
		qp->atomics_enabled = 1;

	/* Pre-CQE-version kernels track QPs by QP number in qp_table
	 * (qp_table_mutex is held until after mlx5_store_qp below);
	 * newer kernels use a user-supplied index (uidx) instead. */
	if (!ctx->cqe_version) {
		cmd.uidx = 0xffffff;
		pthread_mutex_lock(&ctx->qp_table_mutex);
	} else if (!is_xrc_tgt(attr->qp_type)) {
		usr_idx = mlx5_store_uidx(ctx, qp);
		if (usr_idx < 0) {
			mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n");
			goto err_rq_db;
		}

		cmd.uidx = usr_idx;
	}

	/* Extended attributes require the create_qp_ex2 command. */
	if (attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK)
		ret = mlx5_cmd_create_qp_ex(context, attr, &cmd, qp, &resp_ex);
	else
		ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, sizeof(qp->verbs_qp),
					   attr, &cmd.ibv_cmd, sizeof(cmd),
					   &resp.ibv_resp, sizeof(resp));
	if (ret) {
		mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret);
		goto err_free_uidx;
	}

	uuar_index = (attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK) ?
			resp_ex.uuar_index : resp.uuar_index;
	if (!ctx->cqe_version) {
		if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) {
			ret = mlx5_store_qp(ctx, ibqp->qp_num, qp);
			if (ret) {
				mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret);
				goto err_destroy;
			}
		}

		pthread_mutex_unlock(&ctx->qp_table_mutex);
	}

	map_uuar(context, qp, uuar_index);

	qp->rq.max_post = qp->rq.wqe_cnt;
	if (attr->sq_sig_all)
		qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
	else
		qp->sq_signal_bits = 0;

	/* Report the actual (possibly rounded-up) capabilities back. */
	attr->cap.max_send_wr = qp->sq.max_post;
	attr->cap.max_recv_wr = qp->rq.max_post;
	attr->cap.max_recv_sge = qp->rq.max_gs;

	qp->rsc.type = MLX5_RSC_TYPE_QP;
	qp->rsc.rsn = (ctx->cqe_version && !is_xrc_tgt(attr->qp_type)) ?
		      usr_idx : ibqp->qp_num;

	return ibqp;

err_destroy:
	ibv_cmd_destroy_qp(ibqp);

err_free_uidx:
	if (!ctx->cqe_version)
		pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
	else if (!is_xrc_tgt(attr->qp_type))
		mlx5_clear_uidx(ctx, usr_idx);

err_rq_db:
	mlx5_free_db(to_mctx(context), qp->db);

err_free_qp_buf:
	mlx5_free_qp_buf(qp);

err:
	free(qp);

	return NULL;
}
1409 
1410 struct ibv_qp *mlx5_create_qp(struct ibv_pd *pd,
1411 			      struct ibv_qp_init_attr *attr)
1412 {
1413 	struct ibv_qp *qp;
1414 	struct ibv_qp_init_attr_ex attrx;
1415 
1416 	memset(&attrx, 0, sizeof(attrx));
1417 	memcpy(&attrx, attr, sizeof(*attr));
1418 	attrx.comp_mask = IBV_QP_INIT_ATTR_PD;
1419 	attrx.pd = pd;
1420 	qp = create_qp(pd->context, &attrx);
1421 	if (qp)
1422 		memcpy(attr, &attrx, sizeof(*attr));
1423 
1424 	return qp;
1425 }
1426 
1427 static void mlx5_lock_cqs(struct ibv_qp *qp)
1428 {
1429 	struct mlx5_cq *send_cq = to_mcq(qp->send_cq);
1430 	struct mlx5_cq *recv_cq = to_mcq(qp->recv_cq);
1431 
1432 	if (send_cq && recv_cq) {
1433 		if (send_cq == recv_cq) {
1434 			mlx5_spin_lock(&send_cq->lock);
1435 		} else if (send_cq->cqn < recv_cq->cqn) {
1436 			mlx5_spin_lock(&send_cq->lock);
1437 			mlx5_spin_lock(&recv_cq->lock);
1438 		} else {
1439 			mlx5_spin_lock(&recv_cq->lock);
1440 			mlx5_spin_lock(&send_cq->lock);
1441 		}
1442 	} else if (send_cq) {
1443 		mlx5_spin_lock(&send_cq->lock);
1444 	} else if (recv_cq) {
1445 		mlx5_spin_lock(&recv_cq->lock);
1446 	}
1447 }
1448 
1449 static void mlx5_unlock_cqs(struct ibv_qp *qp)
1450 {
1451 	struct mlx5_cq *send_cq = to_mcq(qp->send_cq);
1452 	struct mlx5_cq *recv_cq = to_mcq(qp->recv_cq);
1453 
1454 	if (send_cq && recv_cq) {
1455 		if (send_cq == recv_cq) {
1456 			mlx5_spin_unlock(&send_cq->lock);
1457 		} else if (send_cq->cqn < recv_cq->cqn) {
1458 			mlx5_spin_unlock(&recv_cq->lock);
1459 			mlx5_spin_unlock(&send_cq->lock);
1460 		} else {
1461 			mlx5_spin_unlock(&send_cq->lock);
1462 			mlx5_spin_unlock(&recv_cq->lock);
1463 		}
1464 	} else if (send_cq) {
1465 		mlx5_spin_unlock(&send_cq->lock);
1466 	} else if (recv_cq) {
1467 		mlx5_spin_unlock(&recv_cq->lock);
1468 	}
1469 }
1470 
/*
 * Destroy a QP: tear down the kernel object, flush its CQEs from both
 * CQs, remove it from the lookup table and release its host resources.
 */
int mlx5_destroy_qp(struct ibv_qp *ibqp)
{
	struct mlx5_qp *qp = to_mqp(ibqp);
	struct mlx5_context *ctx = to_mctx(ibqp->context);
	int ret;

	/* RSS QPs own no buffers, doorbells or table entries; only the
	 * kernel object needs destroying. */
	if (qp->rss_qp) {
		ret = ibv_cmd_destroy_qp(ibqp);
		if (ret)
			return ret;
		goto free;
	}

	/* Without CQE versioning the qp_table (keyed by QP number) must
	 * be updated under qp_table_mutex. */
	if (!ctx->cqe_version)
		pthread_mutex_lock(&ctx->qp_table_mutex);

	ret = ibv_cmd_destroy_qp(ibqp);
	if (ret) {
		if (!ctx->cqe_version)
			pthread_mutex_unlock(&ctx->qp_table_mutex);
		return ret;
	}

	/* Hold both CQ locks (in deadlock-safe order) while flushing
	 * this QP's completions. */
	mlx5_lock_cqs(ibqp);

	__mlx5_cq_clean(to_mcq(ibqp->recv_cq), qp->rsc.rsn,
			ibqp->srq ? to_msrq(ibqp->srq) : NULL);
	if (ibqp->send_cq != ibqp->recv_cq)
		__mlx5_cq_clean(to_mcq(ibqp->send_cq), qp->rsc.rsn, NULL);

	if (!ctx->cqe_version) {
		if (qp->sq.wqe_cnt || qp->rq.wqe_cnt)
			mlx5_clear_qp(ctx, ibqp->qp_num);
	}

	mlx5_unlock_cqs(ibqp);
	if (!ctx->cqe_version)
		pthread_mutex_unlock(&ctx->qp_table_mutex);
	else if (!is_xrc_tgt(ibqp->qp_type))
		/* With CQE versioning the QP was tracked by user index. */
		mlx5_clear_uidx(ctx, qp->rsc.rsn);

	mlx5_free_db(ctx, qp->db);
	mlx5_free_qp_buf(qp);
free:
	free(qp);

	return 0;
}
1519 
1520 int mlx5_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
1521 		  int attr_mask, struct ibv_qp_init_attr *init_attr)
1522 {
1523 	struct ibv_query_qp cmd;
1524 	struct mlx5_qp *qp = to_mqp(ibqp);
1525 	int ret;
1526 
1527 	if (qp->rss_qp)
1528 		return ENOSYS;
1529 
1530 	ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof(cmd));
1531 	if (ret)
1532 		return ret;
1533 
1534 	init_attr->cap.max_send_wr     = qp->sq.max_post;
1535 	init_attr->cap.max_send_sge    = qp->sq.max_gs;
1536 	init_attr->cap.max_inline_data = qp->max_inline_data;
1537 
1538 	attr->cap = init_attr->cap;
1539 
1540 	return 0;
1541 }
1542 
/* attr_mask bits that can only be conveyed through the extended
 * ibv_cmd_modify_qp_ex() command. */
enum {
	MLX5_MODIFY_QP_EX_ATTR_MASK = IBV_QP_RATE_LIMIT,
};
1546 
/*
 * Modify QP state/attributes.  Besides forwarding the command to the
 * kernel, this caches port offload capabilities on port binding,
 * rewinds software queue state on transition to RESET, and rings the
 * receive doorbell for raw packet QPs on transition to RTR.
 */
int mlx5_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
		   int attr_mask)
{
	struct ibv_modify_qp cmd = {};
	struct ibv_modify_qp_ex cmd_ex = {};
	struct ibv_modify_qp_resp_ex resp = {};
	struct mlx5_qp *mqp = to_mqp(qp);
	struct mlx5_context *context = to_mctx(qp->context);
	int ret;
	uint32_t *db;

	/* RSS QPs cannot be modified through this interface. */
	if (mqp->rss_qp)
		return ENOSYS;

	if (attr_mask & IBV_QP_PORT) {
		switch (qp->qp_type) {
		case IBV_QPT_RAW_PACKET:
			/* Once bound to an Ethernet port, cache that port's
			 * checksum and TSO capabilities on the QP. */
			if (context->cached_link_layer[attr->port_num - 1] ==
			     IBV_LINK_LAYER_ETHERNET) {
				if (context->cached_device_cap_flags &
				    IBV_DEVICE_RAW_IP_CSUM)
					mqp->qp_cap_cache |=
						MLX5_CSUM_SUPPORT_RAW_OVER_ETH |
						MLX5_RX_CSUM_VALID;

				if (ibv_is_qpt_supported(
				 context->cached_tso_caps.supported_qpts,
				 IBV_QPT_RAW_PACKET))
					mqp->max_tso =
					     context->cached_tso_caps.max_tso;
			}
			break;
		default:
			break;
		}
	}

	/* Rate-limit (and future extended) attributes need the extended
	 * modify command. */
	if (attr_mask & MLX5_MODIFY_QP_EX_ATTR_MASK)
		ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask,
					   &cmd_ex,
					   sizeof(cmd_ex), sizeof(cmd_ex),
					   &resp,
					   sizeof(resp), sizeof(resp));
	else
		ret = ibv_cmd_modify_qp(qp, attr, attr_mask,
					&cmd, sizeof(cmd));

	/* On a successful transition to RESET, flush this QP's CQEs and
	 * rewind the software queue indices and doorbell record to match
	 * the hardware reset. */
	if (!ret		       &&
	    (attr_mask & IBV_QP_STATE) &&
	    attr->qp_state == IBV_QPS_RESET) {
		if (qp->recv_cq) {
			mlx5_cq_clean(to_mcq(qp->recv_cq), mqp->rsc.rsn,
				      qp->srq ? to_msrq(qp->srq) : NULL);
		}
		if (qp->send_cq != qp->recv_cq && qp->send_cq)
			mlx5_cq_clean(to_mcq(qp->send_cq),
				      to_mqp(qp)->rsc.rsn, NULL);

		mlx5_init_qp_indices(mqp);
		db = mqp->db;
		db[MLX5_RCV_DBR] = 0;
		db[MLX5_SND_DBR] = 0;
	}

	/*
	 * When the Raw Packet QP is in INIT state, its RQ
	 * underneath is already in RDY, which means it can
	 * receive packets. According to the IB spec, a QP can't
	 * receive packets until moved to RTR state. To achieve this,
	 * for Raw Packet QPs, we update the doorbell record
	 * once the QP is moved to RTR.
	 */
	if (!ret &&
	    (attr_mask & IBV_QP_STATE) &&
	    attr->qp_state == IBV_QPS_RTR &&
	    qp->qp_type == IBV_QPT_RAW_PACKET) {
		mlx5_spin_lock(&mqp->rq.lock);
		mqp->db[MLX5_RCV_DBR] = htobe32(mqp->rq.head & 0xffff);
		mlx5_spin_unlock(&mqp->rq.lock);
	}

	return ret;
}
1630 
1631 #define RROCE_UDP_SPORT_MIN 0xC000
1632 #define RROCE_UDP_SPORT_MAX 0xFFFF
/*
 * Create an address handle.  Fills the hardware address vector (AV)
 * from the generic attributes; for Ethernet (RoCE) ports also resolves
 * the destination MAC, either via the kernel (when create_ah with a
 * response is supported) or in user space.
 */
struct ibv_ah *mlx5_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
{
	struct mlx5_context *ctx = to_mctx(pd->context);
	struct ibv_port_attr port_attr;
	struct mlx5_ah *ah;
	uint32_t gid_type;
	uint32_t tmp;
	uint8_t grh;
	int is_eth;

	if (attr->port_num < 1 || attr->port_num > ctx->num_ports)
		return NULL;

	/* Prefer the cached link layer; fall back to querying the port. */
	if (ctx->cached_link_layer[attr->port_num - 1]) {
		is_eth = ctx->cached_link_layer[attr->port_num - 1] ==
			IBV_LINK_LAYER_ETHERNET;
	} else {
		if (ibv_query_port(pd->context, attr->port_num, &port_attr))
			return NULL;

		is_eth = (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET);
	}

	/* RoCE addresses must carry a GRH. */
	if (unlikely((!attr->is_global) && is_eth)) {
		errno = EINVAL;
		return NULL;
	}

	ah = calloc(1, sizeof *ah);
	if (!ah)
		return NULL;

	if (is_eth) {
		if (ibv_query_gid_type(pd->context, attr->port_num,
				       attr->grh.sgid_index, &gid_type))
			goto err;

		/* For RoCE v2 the rlid field carries the UDP source port,
		 * randomized within the RoCE v2 source-port range. */
		if (gid_type == IBV_GID_TYPE_ROCE_V2)
			ah->av.rlid = htobe16(rand() % (RROCE_UDP_SPORT_MAX + 1
						      - RROCE_UDP_SPORT_MIN)
					    + RROCE_UDP_SPORT_MIN);
		/* Since RoCE packets must contain GRH, this bit is reserved
		 * for RoCE and shouldn't be set.
		 */
		grh = 0;
	} else {
		ah->av.fl_mlid = attr->src_path_bits & 0x7f;
		ah->av.rlid = htobe16(attr->dlid);
		grh = 1;
	}
	ah->av.stat_rate_sl = (attr->static_rate << 4) | attr->sl;
	if (attr->is_global) {
		ah->av.tclass = attr->grh.traffic_class;
		ah->av.hop_limit = attr->grh.hop_limit;
		/* Pack grh-enable, SGID index and flow label into one word. */
		tmp = htobe32((grh << 30) |
			    ((attr->grh.sgid_index & 0xff) << 20) |
			    (attr->grh.flow_label & 0xfffff));
		ah->av.grh_gid_fl = tmp;
		memcpy(ah->av.rgid, attr->grh.dgid.raw, 16);
	}

	if (is_eth) {
		if (ctx->cmds_supp_uhw & MLX5_USER_CMDS_SUPP_UHW_CREATE_AH) {
			struct mlx5_create_ah_resp resp = {};

			if (ibv_cmd_create_ah(pd, &ah->ibv_ah, attr, &resp.ibv_resp, sizeof(resp)))
				goto err;

			/* Kernel-created AH: remember so destroy goes through
			 * the kernel too. */
			ah->kern_ah = true;
			memcpy(ah->av.rmac, resp.dmac, ETHERNET_LL_SIZE);
		} else {
			uint16_t vid;

			if (ibv_resolve_eth_l2_from_gid(pd->context, attr,
							ah->av.rmac, &vid))
				goto err;
		}
	}

	return &ah->ibv_ah;
err:
	free(ah);
	return NULL;
}
1717 
1718 int mlx5_destroy_ah(struct ibv_ah *ah)
1719 {
1720 	struct mlx5_ah *mah = to_mah(ah);
1721 	int err;
1722 
1723 	if (mah->kern_ah) {
1724 		err = ibv_cmd_destroy_ah(ah);
1725 		if (err)
1726 			return err;
1727 	}
1728 
1729 	free(mah);
1730 	return 0;
1731 }
1732 
/* Attach the QP to a multicast group; handled entirely by the kernel. */
int mlx5_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
{
	return ibv_cmd_attach_mcast(qp, gid, lid);
}
1737 
/* Detach the QP from a multicast group; handled entirely by the kernel. */
int mlx5_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
{
	return ibv_cmd_detach_mcast(qp, gid, lid);
}
1742 
/* Extended create-QP entry point; the shared create_qp() handles both
 * the legacy and extended attribute forms. */
struct ibv_qp *mlx5_create_qp_ex(struct ibv_context *context,
				 struct ibv_qp_init_attr_ex *attr)
{
	return create_qp(context, attr);
}
1748 
1749 int mlx5_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num)
1750 {
1751 	struct mlx5_srq *msrq = to_msrq(srq);
1752 
1753 	*srq_num = msrq->srqn;
1754 
1755 	return 0;
1756 }
1757 
1758 struct ibv_xrcd *
1759 mlx5_open_xrcd(struct ibv_context *context,
1760 	       struct ibv_xrcd_init_attr *xrcd_init_attr)
1761 {
1762 	int err;
1763 	struct verbs_xrcd *xrcd;
1764 	struct ibv_open_xrcd cmd = {};
1765 	struct ibv_open_xrcd_resp resp = {};
1766 
1767 	xrcd = calloc(1, sizeof(*xrcd));
1768 	if (!xrcd)
1769 		return NULL;
1770 
1771 	err = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), xrcd_init_attr,
1772 				&cmd, sizeof(cmd), &resp, sizeof(resp));
1773 	if (err) {
1774 		free(xrcd);
1775 		return NULL;
1776 	}
1777 
1778 	return &xrcd->xrcd;
1779 }
1780 
1781 int mlx5_close_xrcd(struct ibv_xrcd *ib_xrcd)
1782 {
1783 	struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd);
1784 	int ret;
1785 
1786 	ret = ibv_cmd_close_xrcd(xrcd);
1787 	if (!ret)
1788 		free(xrcd);
1789 
1790 	return ret;
1791 }
1792 
1793 static struct ibv_srq *
1794 mlx5_create_xrc_srq(struct ibv_context *context,
1795 		    struct ibv_srq_init_attr_ex *attr)
1796 {
1797 	int err;
1798 	struct mlx5_create_srq_ex cmd;
1799 	struct mlx5_create_srq_resp resp;
1800 	struct mlx5_srq *msrq;
1801 	struct mlx5_context *ctx = to_mctx(context);
1802 	int max_sge;
1803 	struct ibv_srq *ibsrq;
1804 	int uidx;
1805 	FILE *fp = ctx->dbg_fp;
1806 
1807 	msrq = calloc(1, sizeof(*msrq));
1808 	if (!msrq)
1809 		return NULL;
1810 
1811 	ibsrq = (struct ibv_srq *)&msrq->vsrq;
1812 
1813 	memset(&cmd, 0, sizeof(cmd));
1814 	memset(&resp, 0, sizeof(resp));
1815 
1816 	if (mlx5_spinlock_init(&msrq->lock)) {
1817 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
1818 		goto err;
1819 	}
1820 
1821 	if (attr->attr.max_wr > ctx->max_srq_recv_wr) {
1822 		fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n",
1823 			__func__, __LINE__, attr->attr.max_wr,
1824 			ctx->max_srq_recv_wr);
1825 		errno = EINVAL;
1826 		goto err;
1827 	}
1828 
1829 	/*
1830 	 * this calculation does not consider required control segments. The
1831 	 * final calculation is done again later. This is done so to avoid
1832 	 * overflows of variables
1833 	 */
1834 	max_sge = ctx->max_recv_wr / sizeof(struct mlx5_wqe_data_seg);
1835 	if (attr->attr.max_sge > max_sge) {
1836 		fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n",
1837 			__func__, __LINE__, attr->attr.max_wr,
1838 			ctx->max_srq_recv_wr);
1839 		errno = EINVAL;
1840 		goto err;
1841 	}
1842 
1843 	msrq->max     = align_queue_size(attr->attr.max_wr + 1);
1844 	msrq->max_gs  = attr->attr.max_sge;
1845 	msrq->counter = 0;
1846 
1847 	if (mlx5_alloc_srq_buf(context, msrq)) {
1848 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
1849 		goto err;
1850 	}
1851 
1852 	msrq->db = mlx5_alloc_dbrec(ctx);
1853 	if (!msrq->db) {
1854 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
1855 		goto err_free;
1856 	}
1857 
1858 	*msrq->db = 0;
1859 
1860 	cmd.buf_addr = (uintptr_t)msrq->buf.buf;
1861 	cmd.db_addr  = (uintptr_t)msrq->db;
1862 	msrq->wq_sig = srq_sig_enabled();
1863 	if (msrq->wq_sig)
1864 		cmd.flags = MLX5_SRQ_FLAG_SIGNATURE;
1865 
1866 	attr->attr.max_sge = msrq->max_gs;
1867 	if (ctx->cqe_version) {
1868 		uidx = mlx5_store_uidx(ctx, msrq);
1869 		if (uidx < 0) {
1870 			mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n");
1871 			goto err_free_db;
1872 		}
1873 		cmd.uidx = uidx;
1874 	} else {
1875 		cmd.uidx = 0xffffff;
1876 		pthread_mutex_lock(&ctx->srq_table_mutex);
1877 	}
1878 
1879 	err = ibv_cmd_create_srq_ex(context, &msrq->vsrq, sizeof(msrq->vsrq),
1880 				    attr, &cmd.ibv_cmd, sizeof(cmd),
1881 				    &resp.ibv_resp, sizeof(resp));
1882 	if (err)
1883 		goto err_free_uidx;
1884 
1885 	if (!ctx->cqe_version) {
1886 		err = mlx5_store_srq(to_mctx(context), resp.srqn, msrq);
1887 		if (err)
1888 			goto err_destroy;
1889 
1890 		pthread_mutex_unlock(&ctx->srq_table_mutex);
1891 	}
1892 
1893 	msrq->srqn = resp.srqn;
1894 	msrq->rsc.type = MLX5_RSC_TYPE_XSRQ;
1895 	msrq->rsc.rsn = ctx->cqe_version ? cmd.uidx : resp.srqn;
1896 
1897 	return ibsrq;
1898 
1899 err_destroy:
1900 	ibv_cmd_destroy_srq(ibsrq);
1901 
1902 err_free_uidx:
1903 	if (ctx->cqe_version)
1904 		mlx5_clear_uidx(ctx, cmd.uidx);
1905 	else
1906 		pthread_mutex_unlock(&ctx->srq_table_mutex);
1907 
1908 err_free_db:
1909 	mlx5_free_db(ctx, msrq->db);
1910 
1911 err_free:
1912 	free(msrq->wrid);
1913 	mlx5_free_buf(&msrq->buf);
1914 
1915 err:
1916 	free(msrq);
1917 
1918 	return NULL;
1919 }
1920 
1921 struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context,
1922 				   struct ibv_srq_init_attr_ex *attr)
1923 {
1924 	if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ||
1925 	    (attr->srq_type == IBV_SRQT_BASIC))
1926 		return mlx5_create_srq(attr->pd,
1927 				       (struct ibv_srq_init_attr *)attr);
1928 	else if (attr->srq_type == IBV_SRQT_XRC)
1929 		return mlx5_create_xrc_srq(context, attr);
1930 
1931 	return NULL;
1932 }
1933 
1934 int mlx5_query_device_ex(struct ibv_context *context,
1935 			 const struct ibv_query_device_ex_input *input,
1936 			 struct ibv_device_attr_ex *attr,
1937 			 size_t attr_size)
1938 {
1939 	struct mlx5_context *mctx = to_mctx(context);
1940 	struct mlx5_query_device_ex_resp resp;
1941 	struct mlx5_query_device_ex cmd;
1942 	struct ibv_device_attr *a;
1943 	uint64_t raw_fw_ver;
1944 	unsigned sub_minor;
1945 	unsigned major;
1946 	unsigned minor;
1947 	int err;
1948 	int cmd_supp_uhw = mctx->cmds_supp_uhw &
1949 		MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE;
1950 
1951 	memset(&cmd, 0, sizeof(cmd));
1952 	memset(&resp, 0, sizeof(resp));
1953 	err = ibv_cmd_query_device_ex(context, input, attr, attr_size,
1954 				      &raw_fw_ver,
1955 				      &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd),
1956 				      &resp.ibv_resp, sizeof(resp.ibv_resp),
1957 				      cmd_supp_uhw ? sizeof(resp) : sizeof(resp.ibv_resp));
1958 	if (err)
1959 		return err;
1960 
1961 	attr->tso_caps = resp.tso_caps;
1962 	attr->rss_caps.rx_hash_fields_mask = resp.rss_caps.rx_hash_fields_mask;
1963 	attr->rss_caps.rx_hash_function = resp.rss_caps.rx_hash_function;
1964 	attr->packet_pacing_caps = resp.packet_pacing_caps.caps;
1965 
1966 	if (resp.support_multi_pkt_send_wqe)
1967 		mctx->vendor_cap_flags |= MLX5_VENDOR_CAP_FLAGS_MPW;
1968 
1969 	mctx->cqe_comp_caps = resp.cqe_comp_caps;
1970 
1971 	major     = (raw_fw_ver >> 32) & 0xffff;
1972 	minor     = (raw_fw_ver >> 16) & 0xffff;
1973 	sub_minor = raw_fw_ver & 0xffff;
1974 	a = &attr->orig_attr;
1975 	snprintf(a->fw_ver, sizeof(a->fw_ver), "%d.%d.%04d",
1976 		 major, minor, sub_minor);
1977 
1978 	return 0;
1979 }
1980 
/* WQ signature is enabled when the MLX5_RWQ_SIGNATURE environment
 * variable is present (any value); the context argument is unused.
 */
static int rwq_sig_enabled(struct ibv_context *context)
{
	return getenv("MLX5_RWQ_SIGNATURE") ? 1 : 0;
}
1991 
1992 static void mlx5_free_rwq_buf(struct mlx5_rwq *rwq, struct ibv_context *context)
1993 {
1994 	struct mlx5_context *ctx = to_mctx(context);
1995 
1996 	mlx5_free_actual_buf(ctx, &rwq->buf);
1997 	free(rwq->rq.wrid);
1998 }
1999 
2000 static int mlx5_alloc_rwq_buf(struct ibv_context *context,
2001 			      struct mlx5_rwq *rwq,
2002 			      int size)
2003 {
2004 	int err;
2005 	enum mlx5_alloc_type default_alloc_type = MLX5_ALLOC_TYPE_PREFER_CONTIG;
2006 
2007 	rwq->rq.wrid = malloc(rwq->rq.wqe_cnt * sizeof(uint64_t));
2008 	if (!rwq->rq.wrid) {
2009 		errno = ENOMEM;
2010 		return -1;
2011 	}
2012 
2013 	err = mlx5_alloc_prefered_buf(to_mctx(context), &rwq->buf,
2014 				      align(rwq->buf_size, to_mdev
2015 				      (context->device)->page_size),
2016 				      to_mdev(context->device)->page_size,
2017 				      default_alloc_type,
2018 				      MLX5_RWQ_PREFIX);
2019 
2020 	if (err) {
2021 		free(rwq->rq.wrid);
2022 		errno = ENOMEM;
2023 		return -1;
2024 	}
2025 
2026 	return 0;
2027 }
2028 
/*
 * Create a work queue (receive type only).  Allocates the WQ buffer and
 * doorbell record, registers the WQ under a user index for CQE lookup,
 * and issues the kernel command.  Returns NULL on failure (errno may be
 * set).
 */
struct ibv_wq *mlx5_create_wq(struct ibv_context *context,
			      struct ibv_wq_init_attr *attr)
{
	struct mlx5_create_wq		cmd;
	struct mlx5_create_wq_resp		resp;
	int				err;
	struct mlx5_rwq			*rwq;
	struct mlx5_context	*ctx = to_mctx(context);
	int ret;
	int32_t				usr_idx = 0;
	FILE *fp = ctx->dbg_fp;

	/* Only receive-type WQs are supported. */
	if (attr->wq_type != IBV_WQT_RQ)
		return NULL;

	memset(&cmd, 0, sizeof(cmd));
	memset(&resp, 0, sizeof(resp));

	rwq = calloc(1, sizeof(*rwq));
	if (!rwq)
		return NULL;

	rwq->wq_sig = rwq_sig_enabled(context);
	if (rwq->wq_sig)
		cmd.drv.flags = MLX5_RWQ_FLAG_SIGNATURE;

	/* Returns the required buffer size, or a negative errno. */
	ret = mlx5_calc_rwq_size(ctx, rwq, attr);
	if (ret < 0) {
		errno = -ret;
		goto err;
	}

	rwq->buf_size = ret;
	if (mlx5_alloc_rwq_buf(context, rwq, ret))
		goto err;

	mlx5_init_rwq_indices(rwq);

	if (mlx5_spinlock_init(&rwq->rq.lock))
		goto err_free_rwq_buf;

	rwq->db = mlx5_alloc_dbrec(ctx);
	if (!rwq->db)
		goto err_free_rwq_buf;

	rwq->db[MLX5_RCV_DBR] = 0;
	rwq->db[MLX5_SND_DBR] = 0;
	rwq->pbuff = rwq->buf.buf + rwq->rq.offset;
	rwq->recv_db =  &rwq->db[MLX5_RCV_DBR];
	cmd.drv.buf_addr = (uintptr_t)rwq->buf.buf;
	cmd.drv.db_addr  = (uintptr_t)rwq->db;
	cmd.drv.rq_wqe_count = rwq->rq.wqe_cnt;
	cmd.drv.rq_wqe_shift = rwq->rq.wqe_shift;
	/* WQs are always tracked by user index for CQE lookup. */
	usr_idx = mlx5_store_uidx(ctx, rwq);
	if (usr_idx < 0) {
		mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n");
		goto err_free_db_rec;
	}

	cmd.drv.user_index = usr_idx;
	err = ibv_cmd_create_wq(context, attr, &rwq->wq, &cmd.ibv_cmd,
				sizeof(cmd.ibv_cmd),
				sizeof(cmd),
				&resp.ibv_resp, sizeof(resp.ibv_resp),
				sizeof(resp));
	if (err)
		goto err_create;

	rwq->rsc.type = MLX5_RSC_TYPE_RWQ;
	rwq->rsc.rsn =  cmd.drv.user_index;

	rwq->wq.post_recv = mlx5_post_wq_recv;
	return &rwq->wq;

err_create:
	mlx5_clear_uidx(ctx, cmd.drv.user_index);
err_free_db_rec:
	mlx5_free_db(to_mctx(context), rwq->db);
err_free_rwq_buf:
	mlx5_free_rwq_buf(rwq, context);
err:
	free(rwq);
	return NULL;
}
2113 
2114 int mlx5_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr)
2115 {
2116 	struct mlx5_modify_wq	cmd = {};
2117 	struct mlx5_rwq *rwq = to_mrwq(wq);
2118 
2119 	if ((attr->attr_mask & IBV_WQ_ATTR_STATE) &&
2120 	    attr->wq_state == IBV_WQS_RDY) {
2121 		if ((attr->attr_mask & IBV_WQ_ATTR_CURR_STATE) &&
2122 		    attr->curr_wq_state != wq->state)
2123 			return -EINVAL;
2124 
2125 		if (wq->state == IBV_WQS_RESET) {
2126 			mlx5_spin_lock(&to_mcq(wq->cq)->lock);
2127 			__mlx5_cq_clean(to_mcq(wq->cq),
2128 					rwq->rsc.rsn, NULL);
2129 			mlx5_spin_unlock(&to_mcq(wq->cq)->lock);
2130 			mlx5_init_rwq_indices(rwq);
2131 			rwq->db[MLX5_RCV_DBR] = 0;
2132 			rwq->db[MLX5_SND_DBR] = 0;
2133 		}
2134 	}
2135 
2136 	return ibv_cmd_modify_wq(wq, attr, &cmd.ibv_cmd,  sizeof(cmd.ibv_cmd), sizeof(cmd));
2137 }
2138 
2139 int mlx5_destroy_wq(struct ibv_wq *wq)
2140 {
2141 	struct mlx5_rwq *rwq = to_mrwq(wq);
2142 	int ret;
2143 
2144 	ret = ibv_cmd_destroy_wq(wq);
2145 	if (ret)
2146 		return ret;
2147 
2148 	mlx5_spin_lock(&to_mcq(wq->cq)->lock);
2149 	__mlx5_cq_clean(to_mcq(wq->cq), rwq->rsc.rsn, NULL);
2150 	mlx5_spin_unlock(&to_mcq(wq->cq)->lock);
2151 	mlx5_clear_uidx(to_mctx(wq->context), rwq->rsc.rsn);
2152 	mlx5_free_db(to_mctx(wq->context), rwq->db);
2153 	mlx5_free_rwq_buf(rwq, wq->context);
2154 	free(rwq);
2155 
2156 	return 0;
2157 }
2158 
2159 struct ibv_rwq_ind_table *mlx5_create_rwq_ind_table(struct ibv_context *context,
2160 						    struct ibv_rwq_ind_table_init_attr *init_attr)
2161 {
2162 	struct ibv_create_rwq_ind_table *cmd;
2163 	struct mlx5_create_rwq_ind_table_resp resp;
2164 	struct ibv_rwq_ind_table *ind_table;
2165 	uint32_t required_tbl_size;
2166 	int num_tbl_entries;
2167 	int cmd_size;
2168 	int err;
2169 
2170 	num_tbl_entries = 1 << init_attr->log_ind_tbl_size;
2171 	/* Data must be u64 aligned */
2172 	required_tbl_size = (num_tbl_entries * sizeof(uint32_t)) < sizeof(uint64_t) ?
2173 			sizeof(uint64_t) : (num_tbl_entries * sizeof(uint32_t));
2174 
2175 	cmd_size = required_tbl_size + sizeof(*cmd);
2176 	cmd = calloc(1, cmd_size);
2177 	if (!cmd)
2178 		return NULL;
2179 
2180 	memset(&resp, 0, sizeof(resp));
2181 	ind_table = calloc(1, sizeof(*ind_table));
2182 	if (!ind_table)
2183 		goto free_cmd;
2184 
2185 	err = ibv_cmd_create_rwq_ind_table(context, init_attr, ind_table, cmd,
2186 					   cmd_size, cmd_size, &resp.ibv_resp, sizeof(resp.ibv_resp),
2187 					   sizeof(resp));
2188 	if (err)
2189 		goto err;
2190 
2191 	free(cmd);
2192 	return ind_table;
2193 
2194 err:
2195 	free(ind_table);
2196 free_cmd:
2197 	free(cmd);
2198 	return NULL;
2199 }
2200 
/* Destroy an RSS indirection table; free the container only when the
 * kernel destroy succeeds. */
int mlx5_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table)
{
	int err = ibv_cmd_destroy_rwq_ind_table(rwq_ind_table);

	if (err)
		return err;

	free(rwq_ind_table);
	return 0;
}
2213