xref: /freebsd/contrib/ofed/libmlx5/verbs.c (revision 02e9120893770924227138ba49df1edb3896112a)
1 /*
2  * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <config.h>
34 
35 #include <stdlib.h>
36 #include <stdio.h>
37 #include <string.h>
38 #include <pthread.h>
39 #include <errno.h>
40 #include <limits.h>
41 #include <sys/types.h>
42 #include <sys/stat.h>
43 #include <fcntl.h>
44 #include <unistd.h>
45 #include <sys/mman.h>
46 
47 #include "mlx5.h"
48 #include "mlx5-abi.h"
49 #include "wqe.h"
50 
/*
 * Library-wide flag with external linkage, zero by default.
 * NOTE(review): presumably set elsewhere when the application promises
 * single-threaded use so that locking can be relaxed -- confirm against
 * the code that writes it.
 */
51 int mlx5_single_threaded = 0;
52 
53 static inline int is_xrc_tgt(int type)
54 {
55 	return type == IBV_QPT_XRC_RECV;
56 }
57 
58 int mlx5_query_device(struct ibv_context *context, struct ibv_device_attr *attr)
59 {
60 	struct ibv_query_device cmd;
61 	uint64_t raw_fw_ver;
62 	unsigned major, minor, sub_minor;
63 	int ret;
64 
65 	ret = ibv_cmd_query_device(context, attr, &raw_fw_ver, &cmd, sizeof cmd);
66 	if (ret)
67 		return ret;
68 
69 	major     = (raw_fw_ver >> 32) & 0xffff;
70 	minor     = (raw_fw_ver >> 16) & 0xffff;
71 	sub_minor = raw_fw_ver & 0xffff;
72 
73 	snprintf(attr->fw_ver, sizeof attr->fw_ver,
74 		 "%d.%d.%04d", major, minor, sub_minor);
75 
76 	return 0;
77 }
78 
79 #define READL(ptr) (*((uint32_t *)(ptr)))
80 static int mlx5_read_clock(struct ibv_context *context, uint64_t *cycles)
81 {
82 	unsigned int clockhi, clocklo, clockhi1;
83 	int i;
84 	struct mlx5_context *ctx = to_mctx(context);
85 
86 	if (!ctx->hca_core_clock)
87 		return -EOPNOTSUPP;
88 
89 	/* Handle wraparound */
90 	for (i = 0; i < 2; i++) {
91 		clockhi = be32toh(READL(ctx->hca_core_clock));
92 		clocklo = be32toh(READL(ctx->hca_core_clock + 4));
93 		clockhi1 = be32toh(READL(ctx->hca_core_clock));
94 		if (clockhi == clockhi1)
95 			break;
96 	}
97 
98 	*cycles = (uint64_t)clockhi << 32 | (uint64_t)clocklo;
99 
100 	return 0;
101 }
102 
103 int mlx5_query_rt_values(struct ibv_context *context,
104 			 struct ibv_values_ex *values)
105 {
106 	uint32_t comp_mask = 0;
107 	int err = 0;
108 
109 	if (values->comp_mask & IBV_VALUES_MASK_RAW_CLOCK) {
110 		uint64_t cycles;
111 
112 		err = mlx5_read_clock(context, &cycles);
113 		if (!err) {
114 			values->raw_clock.tv_sec = 0;
115 			values->raw_clock.tv_nsec = cycles;
116 			comp_mask |= IBV_VALUES_MASK_RAW_CLOCK;
117 		}
118 	}
119 
120 	values->comp_mask = comp_mask;
121 
122 	return err;
123 }
124 
125 int mlx5_query_port(struct ibv_context *context, uint8_t port,
126 		     struct ibv_port_attr *attr)
127 {
128 	struct ibv_query_port cmd;
129 
130 	return ibv_cmd_query_port(context, port, attr, &cmd, sizeof cmd);
131 }
132 
133 struct ibv_pd *mlx5_alloc_pd(struct ibv_context *context)
134 {
135 	struct ibv_alloc_pd       cmd;
136 	struct mlx5_alloc_pd_resp resp;
137 	struct mlx5_pd		 *pd;
138 
139 	pd = calloc(1, sizeof *pd);
140 	if (!pd)
141 		return NULL;
142 
143 	if (ibv_cmd_alloc_pd(context, &pd->ibv_pd, &cmd, sizeof cmd,
144 			     &resp.ibv_resp, sizeof resp)) {
145 		free(pd);
146 		return NULL;
147 	}
148 
149 	pd->pdn = resp.pdn;
150 
151 	return &pd->ibv_pd;
152 }
153 
/*
 * Release a protection domain.
 *
 * The mlx5 wrapper is freed only after ibv_cmd_dealloc_pd() succeeds;
 * when it fails its return value is passed back and @pd stays valid.
 */
int mlx5_free_pd(struct ibv_pd *pd)
{
	int rc = ibv_cmd_dealloc_pd(pd);

	if (rc == 0)
		free(to_mpd(pd));

	return rc;
}
165 
166 struct ibv_mr *mlx5_reg_mr(struct ibv_pd *pd, void *addr, size_t length,
167 			   int acc)
168 {
169 	struct mlx5_mr *mr;
170 	struct ibv_reg_mr cmd;
171 	int ret;
172 	enum ibv_access_flags access = (enum ibv_access_flags)acc;
173 	struct ibv_reg_mr_resp resp;
174 
175 	mr = calloc(1, sizeof(*mr));
176 	if (!mr)
177 		return NULL;
178 
179 	ret = ibv_cmd_reg_mr(pd, addr, length, (uintptr_t)addr, access,
180 			     &(mr->ibv_mr), &cmd, sizeof(cmd), &resp,
181 			     sizeof resp);
182 	if (ret) {
183 		mlx5_free_buf(&(mr->buf));
184 		free(mr);
185 		return NULL;
186 	}
187 	mr->alloc_flags = acc;
188 
189 	return &mr->ibv_mr;
190 }
191 
192 int mlx5_rereg_mr(struct ibv_mr *ibmr, int flags, struct ibv_pd *pd, void *addr,
193 		  size_t length, int access)
194 {
195 	struct ibv_rereg_mr cmd;
196 	struct ibv_rereg_mr_resp resp;
197 
198 	if (flags & IBV_REREG_MR_KEEP_VALID)
199 		return ENOTSUP;
200 
201 	return ibv_cmd_rereg_mr(ibmr, flags, addr, length, (uintptr_t)addr,
202 				access, pd, &cmd, sizeof(cmd), &resp,
203 				sizeof(resp));
204 }
205 
/*
 * Deregister a memory region.
 *
 * The mlx5 wrapper is freed only once ibv_cmd_dereg_mr() has
 * succeeded; on failure its return value is passed back and the
 * region is left untouched.
 */
int mlx5_dereg_mr(struct ibv_mr *ibmr)
{
	struct mlx5_mr *mmr = to_mmr(ibmr);
	int rc = ibv_cmd_dereg_mr(ibmr);

	if (rc == 0)
		free(mmr);

	return rc;
}
218 
219 struct ibv_mw *mlx5_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
220 {
221 	struct ibv_mw *mw;
222 	struct ibv_alloc_mw cmd;
223 	struct ibv_alloc_mw_resp resp;
224 	int ret;
225 
226 	mw = malloc(sizeof(*mw));
227 	if (!mw)
228 		return NULL;
229 
230 	memset(mw, 0, sizeof(*mw));
231 
232 	ret = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), &resp,
233 			       sizeof(resp));
234 	if (ret) {
235 		free(mw);
236 		return NULL;
237 	}
238 
239 	return mw;
240 }
241 
242 int mlx5_dealloc_mw(struct ibv_mw *mw)
243 {
244 	int ret;
245 	struct ibv_dealloc_mw cmd;
246 
247 	ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd));
248 	if (ret)
249 		return ret;
250 
251 	free(mw);
252 	return 0;
253 }
254 
/*
 * Round @sz up to the next power of two.
 *
 * @sz: requested size; values of 1 or less (including negative ones)
 *      round to 1.
 *
 * Returns the smallest power of two that is >= @sz, or -ENOMEM (after
 * printing a message to stderr) when that value does not fit in an
 * int.
 */
int mlx5_round_up_power_of_two(long long sz)
{
	long long ret = 1;

	/*
	 * Stop as soon as the candidate no longer fits in an int: the
	 * result is rejected below anyway, and continuing to shift for a
	 * huge sz would overflow the signed long long (undefined
	 * behaviour) instead of terminating.
	 */
	while (ret < sz && ret <= INT_MAX)
		ret <<= 1;

	if (ret > INT_MAX) {
		fprintf(stderr, "%s: roundup overflow\n", __func__);
		return -ENOMEM;
	}

	return (int)ret;
}
269 
/*
 * Queue sizes are powers of two: round @req up accordingly.
 * Returns whatever mlx5_round_up_power_of_two() returns, including its
 * negative error value.
 */
static int align_queue_size(long long req)
{
	int rounded = mlx5_round_up_power_of_two(req);

	return rounded;
}
274 
/*
 * Determine the CQE size to use.
 *
 * The default is 64 bytes; the MLX5_CQE_SIZE environment variable,
 * parsed with atoi(), overrides it.  Only 64 and 128 are accepted.
 *
 * Returns the size, or -EINVAL for any other value.
 */
static int get_cqe_size(void)
{
	const char *env = getenv("MLX5_CQE_SIZE");
	int size = env ? atoi(env) : 64;

	if (size == 64 || size == 128)
		return size;

	return -EINVAL;
}
293 
/*
 * Decide whether scatter-to-CQE is enabled.
 *
 * It is on by default and turned off only when the environment
 * variable MLX5_SCATTER_TO_CQE is set to exactly "0".
 *
 * Returns 1 when enabled, 0 when disabled.
 */
static int use_scatter_to_cqe(void)
{
	const char *env = getenv("MLX5_SCATTER_TO_CQE");

	return !(env != NULL && strcmp(env, "0") == 0);
}
304 
/*
 * Report whether SRQ signatures were requested.
 *
 * Returns 1 when the MLX5_SRQ_SIGNATURE environment variable exists
 * (its value is not inspected), 0 otherwise.
 */
static int srq_sig_enabled(void)
{
	return getenv("MLX5_SRQ_SIGNATURE") != NULL;
}
315 
/*
 * Report whether QP signatures were requested.
 *
 * Returns 1 when the MLX5_QP_SIGNATURE environment variable exists
 * (its value is not inspected), 0 otherwise.
 */
static int qp_sig_enabled(void)
{
	return getenv("MLX5_QP_SIGNATURE") != NULL;
}
326 
327 enum {
328 	CREATE_CQ_SUPPORTED_WC_FLAGS = IBV_WC_STANDARD_FLAGS	|
329 				       IBV_WC_EX_WITH_COMPLETION_TIMESTAMP |
330 				       IBV_WC_EX_WITH_CVLAN |
331 				       IBV_WC_EX_WITH_FLOW_TAG
332 };
333 
334 enum {
335 	CREATE_CQ_SUPPORTED_COMP_MASK = IBV_CQ_INIT_ATTR_MASK_FLAGS
336 };
337 
338 enum {
339 	CREATE_CQ_SUPPORTED_FLAGS = IBV_CREATE_CQ_ATTR_SINGLE_THREADED
340 };
341 
342 static struct ibv_cq_ex *create_cq(struct ibv_context *context,
343 				   const struct ibv_cq_init_attr_ex *cq_attr,
344 				   int cq_alloc_flags,
345 				   struct mlx5dv_cq_init_attr *mlx5cq_attr)
346 {
347 	struct mlx5_create_cq		cmd;
348 	struct mlx5_create_cq_resp	resp;
349 	struct mlx5_cq		       *cq;
350 	int				cqe_sz;
351 	int				ret;
352 	int				ncqe;
353 	struct mlx5_context *mctx = to_mctx(context);
354 	FILE *fp = to_mctx(context)->dbg_fp;
355 
356 	if (!cq_attr->cqe) {
357 		mlx5_dbg(fp, MLX5_DBG_CQ, "CQE invalid\n");
358 		errno = EINVAL;
359 		return NULL;
360 	}
361 
362 	if (cq_attr->comp_mask & ~CREATE_CQ_SUPPORTED_COMP_MASK) {
363 		mlx5_dbg(fp, MLX5_DBG_CQ,
364 			 "Unsupported comp_mask for create_cq\n");
365 		errno = EINVAL;
366 		return NULL;
367 	}
368 
369 	if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
370 	    cq_attr->flags & ~CREATE_CQ_SUPPORTED_FLAGS) {
371 		mlx5_dbg(fp, MLX5_DBG_CQ,
372 			 "Unsupported creation flags requested for create_cq\n");
373 		errno = EINVAL;
374 		return NULL;
375 	}
376 
377 	if (cq_attr->wc_flags & ~CREATE_CQ_SUPPORTED_WC_FLAGS) {
378 		mlx5_dbg(fp, MLX5_DBG_CQ, "\n");
379 		errno = ENOTSUP;
380 		return NULL;
381 	}
382 
383 	cq =  calloc(1, sizeof *cq);
384 	if (!cq) {
385 		mlx5_dbg(fp, MLX5_DBG_CQ, "\n");
386 		return NULL;
387 	}
388 
389 	memset(&cmd, 0, sizeof cmd);
390 	cq->cons_index = 0;
391 
392 	if (mlx5_spinlock_init(&cq->lock))
393 		goto err;
394 
395 	ncqe = align_queue_size(cq_attr->cqe + 1);
396 	if ((ncqe > (1 << 24)) || (ncqe < (cq_attr->cqe + 1))) {
397 		mlx5_dbg(fp, MLX5_DBG_CQ, "ncqe %d\n", ncqe);
398 		errno = EINVAL;
399 		goto err_spl;
400 	}
401 
402 	cqe_sz = get_cqe_size();
403 	if (cqe_sz < 0) {
404 		mlx5_dbg(fp, MLX5_DBG_CQ, "\n");
405 		errno = -cqe_sz;
406 		goto err_spl;
407 	}
408 
409 	if (mlx5_alloc_cq_buf(to_mctx(context), cq, &cq->buf_a, ncqe, cqe_sz)) {
410 		mlx5_dbg(fp, MLX5_DBG_CQ, "\n");
411 		goto err_spl;
412 	}
413 
414 	cq->dbrec  = mlx5_alloc_dbrec(to_mctx(context));
415 	if (!cq->dbrec) {
416 		mlx5_dbg(fp, MLX5_DBG_CQ, "\n");
417 		goto err_buf;
418 	}
419 
420 	cq->dbrec[MLX5_CQ_SET_CI]	= 0;
421 	cq->dbrec[MLX5_CQ_ARM_DB]	= 0;
422 	cq->arm_sn			= 0;
423 	cq->cqe_sz			= cqe_sz;
424 	cq->flags			= cq_alloc_flags;
425 
426 	if (cq_attr->comp_mask & IBV_CQ_INIT_ATTR_MASK_FLAGS &&
427 	    cq_attr->flags & IBV_CREATE_CQ_ATTR_SINGLE_THREADED)
428 		cq->flags |= MLX5_CQ_FLAGS_SINGLE_THREADED;
429 	cmd.buf_addr = (uintptr_t) cq->buf_a.buf;
430 	cmd.db_addr  = (uintptr_t) cq->dbrec;
431 	cmd.cqe_size = cqe_sz;
432 
433 	if (mlx5cq_attr) {
434 		if (mlx5cq_attr->comp_mask & ~(MLX5DV_CQ_INIT_ATTR_MASK_RESERVED - 1)) {
435 			mlx5_dbg(fp, MLX5_DBG_CQ,
436 				   "Unsupported vendor comp_mask for create_cq\n");
437 			errno = EINVAL;
438 			goto err_db;
439 		}
440 
441 		if (mlx5cq_attr->comp_mask & MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE) {
442 			if (mctx->cqe_comp_caps.max_num &&
443 			    (mlx5cq_attr->cqe_comp_res_format &
444 			     mctx->cqe_comp_caps.supported_format)) {
445 				cmd.cqe_comp_en = 1;
446 				cmd.cqe_comp_res_format = mlx5cq_attr->cqe_comp_res_format;
447 			} else {
448 				mlx5_dbg(fp, MLX5_DBG_CQ, "CQE Compression is not supported\n");
449 				errno = EINVAL;
450 				goto err_db;
451 			}
452 		}
453 	}
454 
455 	ret = ibv_cmd_create_cq(context, ncqe - 1, cq_attr->channel,
456 				cq_attr->comp_vector,
457 				ibv_cq_ex_to_cq(&cq->ibv_cq), &cmd.ibv_cmd,
458 				sizeof(cmd), &resp.ibv_resp, sizeof(resp));
459 	if (ret) {
460 		mlx5_dbg(fp, MLX5_DBG_CQ, "ret %d\n", ret);
461 		goto err_db;
462 	}
463 
464 	cq->active_buf = &cq->buf_a;
465 	cq->resize_buf = NULL;
466 	cq->cqn = resp.cqn;
467 	cq->stall_enable = to_mctx(context)->stall_enable;
468 	cq->stall_adaptive_enable = to_mctx(context)->stall_adaptive_enable;
469 	cq->stall_cycles = to_mctx(context)->stall_cycles;
470 
471 	if (cq_alloc_flags & MLX5_CQ_FLAGS_EXTENDED)
472 		mlx5_cq_fill_pfns(cq, cq_attr);
473 
474 	return &cq->ibv_cq;
475 
476 err_db:
477 	mlx5_free_db(to_mctx(context), cq->dbrec);
478 
479 err_buf:
480 	mlx5_free_cq_buf(to_mctx(context), &cq->buf_a);
481 
482 err_spl:
483 	mlx5_spinlock_destroy(&cq->lock);
484 
485 err:
486 	free(cq);
487 
488 	return NULL;
489 }
490 
491 struct ibv_cq *mlx5_create_cq(struct ibv_context *context, int cqe,
492 			      struct ibv_comp_channel *channel,
493 			      int comp_vector)
494 {
495 	struct ibv_cq_ex *cq;
496 	struct ibv_cq_init_attr_ex cq_attr = {.cqe = cqe, .channel = channel,
497 						.comp_vector = comp_vector,
498 						.wc_flags = IBV_WC_STANDARD_FLAGS};
499 
500 	if (cqe <= 0) {
501 		errno = EINVAL;
502 		return NULL;
503 	}
504 
505 	cq = create_cq(context, &cq_attr, 0, NULL);
506 	return cq ? ibv_cq_ex_to_cq(cq) : NULL;
507 }
508 
509 struct ibv_cq_ex *mlx5_create_cq_ex(struct ibv_context *context,
510 				    struct ibv_cq_init_attr_ex *cq_attr)
511 {
512 	return create_cq(context, cq_attr, MLX5_CQ_FLAGS_EXTENDED, NULL);
513 }
514 
515 struct ibv_cq_ex *mlx5dv_create_cq(struct ibv_context *context,
516 				      struct ibv_cq_init_attr_ex *cq_attr,
517 				      struct mlx5dv_cq_init_attr *mlx5_cq_attr)
518 {
519 	struct ibv_cq_ex *cq;
520 	int err = 0;
521 
522 	cq = create_cq(context, cq_attr, MLX5_CQ_FLAGS_EXTENDED, mlx5_cq_attr);
523 	if (!cq)
524 		return NULL;
525 
526 	err = verbs_init_cq(ibv_cq_ex_to_cq(cq), context,
527 		      cq_attr->channel, cq_attr->cq_context);
528 	if (err)
529 		goto err;
530 
531 	return cq;
532 
533 err:
534 	context->ops.destroy_cq(ibv_cq_ex_to_cq(cq));
535 
536 	return NULL;
537 }
538 
539 int mlx5_resize_cq(struct ibv_cq *ibcq, int cqe)
540 {
541 	struct mlx5_cq *cq = to_mcq(ibcq);
542 	struct mlx5_resize_cq_resp resp;
543 	struct mlx5_resize_cq cmd;
544 	struct mlx5_context *mctx = to_mctx(ibcq->context);
545 	int err;
546 
547 	if (cqe < 0) {
548 		errno = EINVAL;
549 		return errno;
550 	}
551 
552 	memset(&cmd, 0, sizeof(cmd));
553 	memset(&resp, 0, sizeof(resp));
554 
555 	if (((long long)cqe * 64) > INT_MAX)
556 		return EINVAL;
557 
558 	mlx5_spin_lock(&cq->lock);
559 	cq->active_cqes = cq->ibv_cq.cqe;
560 	if (cq->active_buf == &cq->buf_a)
561 		cq->resize_buf = &cq->buf_b;
562 	else
563 		cq->resize_buf = &cq->buf_a;
564 
565 	cqe = align_queue_size(cqe + 1);
566 	if (cqe == ibcq->cqe + 1) {
567 		cq->resize_buf = NULL;
568 		err = 0;
569 		goto out;
570 	}
571 
572 	/* currently we don't change cqe size */
573 	cq->resize_cqe_sz = cq->cqe_sz;
574 	cq->resize_cqes = cqe;
575 	err = mlx5_alloc_cq_buf(mctx, cq, cq->resize_buf, cq->resize_cqes, cq->resize_cqe_sz);
576 	if (err) {
577 		cq->resize_buf = NULL;
578 		errno = ENOMEM;
579 		goto out;
580 	}
581 
582 	cmd.buf_addr = (uintptr_t)cq->resize_buf->buf;
583 	cmd.cqe_size = cq->resize_cqe_sz;
584 
585 	err = ibv_cmd_resize_cq(ibcq, cqe - 1, &cmd.ibv_cmd, sizeof(cmd),
586 				&resp.ibv_resp, sizeof(resp));
587 	if (err)
588 		goto out_buf;
589 
590 	mlx5_cq_resize_copy_cqes(cq);
591 	mlx5_free_cq_buf(mctx, cq->active_buf);
592 	cq->active_buf = cq->resize_buf;
593 	cq->ibv_cq.cqe = cqe - 1;
594 	mlx5_spin_unlock(&cq->lock);
595 	cq->resize_buf = NULL;
596 	return 0;
597 
598 out_buf:
599 	mlx5_free_cq_buf(mctx, cq->resize_buf);
600 	cq->resize_buf = NULL;
601 
602 out:
603 	mlx5_spin_unlock(&cq->lock);
604 	return err;
605 }
606 
607 int mlx5_destroy_cq(struct ibv_cq *cq)
608 {
609 	int ret;
610 	struct mlx5_cq *mcq = to_mcq(cq);
611 
612 	ret = ibv_cmd_destroy_cq(cq);
613 	if (ret)
614 		return ret;
615 
616 	verbs_cleanup_cq(cq);
617 	mlx5_free_db(to_mctx(cq->context), to_mcq(cq)->dbrec);
618 	mlx5_free_cq_buf(to_mctx(cq->context), to_mcq(cq)->active_buf);
619 	mlx5_spinlock_destroy(&mcq->lock);
620 	free(to_mcq(cq));
621 
622 	return 0;
623 }
624 
625 struct ibv_srq *mlx5_create_srq(struct ibv_pd *pd,
626 				struct ibv_srq_init_attr *attr)
627 {
628 	struct mlx5_create_srq      cmd;
629 	struct mlx5_create_srq_resp resp;
630 	struct mlx5_srq		   *srq;
631 	int			    ret;
632 	struct mlx5_context	   *ctx;
633 	int			    max_sge;
634 	struct ibv_srq		   *ibsrq;
635 
636 	ctx = to_mctx(pd->context);
637 	srq = calloc(1, sizeof *srq);
638 	if (!srq) {
639 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
640 		return NULL;
641 	}
642 	ibsrq = &srq->vsrq.srq;
643 
644 	memset(&cmd, 0, sizeof cmd);
645 	if (mlx5_spinlock_init(&srq->lock)) {
646 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
647 		goto err;
648 	}
649 
650 	if (attr->attr.max_wr > ctx->max_srq_recv_wr) {
651 		fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n", __func__, __LINE__,
652 			attr->attr.max_wr, ctx->max_srq_recv_wr);
653 		errno = EINVAL;
654 		goto err_spl;
655 	}
656 
657 	/*
658 	 * this calculation does not consider required control segments. The
659 	 * final calculation is done again later. This is done so to avoid
660 	 * overflows of variables
661 	 */
662 	max_sge = ctx->max_rq_desc_sz / sizeof(struct mlx5_wqe_data_seg);
663 	if (attr->attr.max_sge > max_sge) {
664 		fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n", __func__, __LINE__,
665 			attr->attr.max_wr, ctx->max_srq_recv_wr);
666 		errno = EINVAL;
667 		goto err_spl;
668 	}
669 
670 	srq->max     = align_queue_size(attr->attr.max_wr + 1);
671 	srq->max_gs  = attr->attr.max_sge;
672 	srq->counter = 0;
673 
674 	if (mlx5_alloc_srq_buf(pd->context, srq)) {
675 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
676 		goto err_spl;
677 	}
678 
679 	srq->db = mlx5_alloc_dbrec(to_mctx(pd->context));
680 	if (!srq->db) {
681 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
682 		goto err_free;
683 	}
684 
685 	*srq->db = 0;
686 
687 	cmd.buf_addr = (uintptr_t) srq->buf.buf;
688 	cmd.db_addr  = (uintptr_t) srq->db;
689 	srq->wq_sig = srq_sig_enabled();
690 	if (srq->wq_sig)
691 		cmd.flags = MLX5_SRQ_FLAG_SIGNATURE;
692 
693 	attr->attr.max_sge = srq->max_gs;
694 	pthread_mutex_lock(&ctx->srq_table_mutex);
695 	ret = ibv_cmd_create_srq(pd, ibsrq, attr, &cmd.ibv_cmd, sizeof(cmd),
696 				 &resp.ibv_resp, sizeof(resp));
697 	if (ret)
698 		goto err_db;
699 
700 	ret = mlx5_store_srq(ctx, resp.srqn, srq);
701 	if (ret)
702 		goto err_destroy;
703 
704 	pthread_mutex_unlock(&ctx->srq_table_mutex);
705 
706 	srq->srqn = resp.srqn;
707 	srq->rsc.rsn = resp.srqn;
708 	srq->rsc.type = MLX5_RSC_TYPE_SRQ;
709 
710 	return ibsrq;
711 
712 err_destroy:
713 	ibv_cmd_destroy_srq(ibsrq);
714 
715 err_db:
716 	pthread_mutex_unlock(&ctx->srq_table_mutex);
717 	mlx5_free_db(to_mctx(pd->context), srq->db);
718 
719 err_free:
720 	free(srq->wrid);
721 	mlx5_free_buf(&srq->buf);
722 
723 err_spl:
724 	mlx5_spinlock_destroy(&srq->lock);
725 
726 err:
727 	free(srq);
728 
729 	return NULL;
730 }
731 
732 int mlx5_modify_srq(struct ibv_srq *srq,
733 		    struct ibv_srq_attr *attr,
734 		    int attr_mask)
735 {
736 	struct ibv_modify_srq cmd;
737 
738 	return ibv_cmd_modify_srq(srq, attr, attr_mask, &cmd, sizeof cmd);
739 }
740 
741 int mlx5_query_srq(struct ibv_srq *srq,
742 		    struct ibv_srq_attr *attr)
743 {
744 	struct ibv_query_srq cmd;
745 
746 	return ibv_cmd_query_srq(srq, attr, &cmd, sizeof cmd);
747 }
748 
749 int mlx5_destroy_srq(struct ibv_srq *srq)
750 {
751 	int ret;
752 	struct mlx5_srq *msrq = to_msrq(srq);
753 	struct mlx5_context *ctx = to_mctx(srq->context);
754 
755 	ret = ibv_cmd_destroy_srq(srq);
756 	if (ret)
757 		return ret;
758 
759 	if (ctx->cqe_version && msrq->rsc.type == MLX5_RSC_TYPE_XSRQ)
760 		mlx5_clear_uidx(ctx, msrq->rsc.rsn);
761 	else
762 		mlx5_clear_srq(ctx, msrq->srqn);
763 
764 	mlx5_free_db(ctx, msrq->db);
765 	mlx5_free_buf(&msrq->buf);
766 	free(msrq->wrid);
767 	mlx5_spinlock_destroy(&msrq->lock);
768 	free(msrq);
769 
770 	return 0;
771 }
772 
773 static int sq_overhead(enum ibv_qp_type	qp_type)
774 {
775 	size_t size = 0;
776 	size_t mw_bind_size =
777 	    sizeof(struct mlx5_wqe_umr_ctrl_seg) +
778 	    sizeof(struct mlx5_wqe_mkey_context_seg) +
779 	    max_t(size_t, sizeof(struct mlx5_wqe_umr_klm_seg), 64);
780 
781 	switch (qp_type) {
782 	case IBV_QPT_RC:
783 		size += sizeof(struct mlx5_wqe_ctrl_seg) +
784 			max(sizeof(struct mlx5_wqe_atomic_seg) +
785 			    sizeof(struct mlx5_wqe_raddr_seg),
786 			    mw_bind_size);
787 		break;
788 
789 	case IBV_QPT_UC:
790 		size = sizeof(struct mlx5_wqe_ctrl_seg) +
791 			max(sizeof(struct mlx5_wqe_raddr_seg),
792 			    mw_bind_size);
793 		break;
794 
795 	case IBV_QPT_UD:
796 		size = sizeof(struct mlx5_wqe_ctrl_seg) +
797 			sizeof(struct mlx5_wqe_datagram_seg);
798 		break;
799 
800 	case IBV_QPT_XRC_SEND:
801 		size = sizeof(struct mlx5_wqe_ctrl_seg) + mw_bind_size;
802 		SWITCH_FALLTHROUGH;
803 
804 	case IBV_QPT_XRC_RECV:
805 		size = max(size, sizeof(struct mlx5_wqe_ctrl_seg) +
806 			   sizeof(struct mlx5_wqe_xrc_seg) +
807 			   sizeof(struct mlx5_wqe_raddr_seg));
808 		break;
809 
810 	case IBV_QPT_RAW_PACKET:
811 		size = sizeof(struct mlx5_wqe_ctrl_seg) +
812 			sizeof(struct mlx5_wqe_eth_seg);
813 		break;
814 
815 	default:
816 		return -EINVAL;
817 	}
818 
819 	return size;
820 }
821 
822 static int mlx5_calc_send_wqe(struct mlx5_context *ctx,
823 			      struct ibv_qp_init_attr_ex *attr,
824 			      struct mlx5_qp *qp)
825 {
826 	int size;
827 	int inl_size = 0;
828 	int max_gather;
829 	int tot_size;
830 
831 	size = sq_overhead(attr->qp_type);
832 	if (size < 0)
833 		return size;
834 
835 	if (attr->cap.max_inline_data) {
836 		inl_size = size + align(sizeof(struct mlx5_wqe_inl_data_seg) +
837 			attr->cap.max_inline_data, 16);
838 	}
839 
840 	if (attr->comp_mask & IBV_QP_INIT_ATTR_MAX_TSO_HEADER) {
841 		size += align(attr->max_tso_header, 16);
842 		qp->max_tso_header = attr->max_tso_header;
843 	}
844 
845 	max_gather = (ctx->max_sq_desc_sz - size) /
846 		sizeof(struct mlx5_wqe_data_seg);
847 	if (attr->cap.max_send_sge > max_gather)
848 		return -EINVAL;
849 
850 	size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg);
851 	tot_size = max_int(size, inl_size);
852 
853 	if (tot_size > ctx->max_sq_desc_sz)
854 		return -EINVAL;
855 
856 	return align(tot_size, MLX5_SEND_WQE_BB);
857 }
858 
859 static int mlx5_calc_rcv_wqe(struct mlx5_context *ctx,
860 			     struct ibv_qp_init_attr_ex *attr,
861 			     struct mlx5_qp *qp)
862 {
863 	uint32_t size;
864 	int num_scatter;
865 
866 	if (attr->srq)
867 		return 0;
868 
869 	num_scatter = max_t(uint32_t, attr->cap.max_recv_sge, 1);
870 	size = sizeof(struct mlx5_wqe_data_seg) * num_scatter;
871 	if (qp->wq_sig)
872 		size += sizeof(struct mlx5_rwqe_sig);
873 
874 	if (size > ctx->max_rq_desc_sz)
875 		return -EINVAL;
876 
877 	size = mlx5_round_up_power_of_two(size);
878 
879 	return size;
880 }
881 
882 static int mlx5_calc_sq_size(struct mlx5_context *ctx,
883 			     struct ibv_qp_init_attr_ex *attr,
884 			     struct mlx5_qp *qp)
885 {
886 	int wqe_size;
887 	int wq_size;
888 	FILE *fp = ctx->dbg_fp;
889 
890 	if (!attr->cap.max_send_wr)
891 		return 0;
892 
893 	wqe_size = mlx5_calc_send_wqe(ctx, attr, qp);
894 	if (wqe_size < 0) {
895 		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
896 		return wqe_size;
897 	}
898 
899 	if (wqe_size > ctx->max_sq_desc_sz) {
900 		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
901 		return -EINVAL;
902 	}
903 
904 	qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) -
905 		sizeof(struct mlx5_wqe_inl_data_seg);
906 	attr->cap.max_inline_data = qp->max_inline_data;
907 
908 	/*
909 	 * to avoid overflow, we limit max_send_wr so
910 	 * that the multiplication will fit in int
911 	 */
912 	if (attr->cap.max_send_wr > 0x7fffffff / ctx->max_sq_desc_sz) {
913 		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
914 		return -EINVAL;
915 	}
916 
917 	wq_size = mlx5_round_up_power_of_two(attr->cap.max_send_wr * wqe_size);
918 	qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
919 	if (qp->sq.wqe_cnt > ctx->max_send_wqebb) {
920 		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
921 		return -EINVAL;
922 	}
923 
924 	qp->sq.wqe_shift = mlx5_ilog2(MLX5_SEND_WQE_BB);
925 	qp->sq.max_gs = attr->cap.max_send_sge;
926 	qp->sq.max_post = wq_size / wqe_size;
927 
928 	return wq_size;
929 }
930 
931 static int mlx5_calc_rwq_size(struct mlx5_context *ctx,
932 			      struct mlx5_rwq *rwq,
933 			      struct ibv_wq_init_attr *attr)
934 {
935 	size_t wqe_size;
936 	int wq_size;
937 	uint32_t num_scatter;
938 	int scat_spc;
939 
940 	if (!attr->max_wr)
941 		return -EINVAL;
942 
943 	/* TBD: check caps for RQ */
944 	num_scatter = max_t(uint32_t, attr->max_sge, 1);
945 	wqe_size = sizeof(struct mlx5_wqe_data_seg) * num_scatter;
946 
947 	if (rwq->wq_sig)
948 		wqe_size += sizeof(struct mlx5_rwqe_sig);
949 
950 	if (wqe_size <= 0 || wqe_size > ctx->max_rq_desc_sz)
951 		return -EINVAL;
952 
953 	wqe_size = mlx5_round_up_power_of_two(wqe_size);
954 	wq_size = mlx5_round_up_power_of_two(attr->max_wr) * wqe_size;
955 	wq_size = max(wq_size, MLX5_SEND_WQE_BB);
956 	rwq->rq.wqe_cnt = wq_size / wqe_size;
957 	rwq->rq.wqe_shift = mlx5_ilog2(wqe_size);
958 	rwq->rq.max_post = 1 << mlx5_ilog2(wq_size / wqe_size);
959 	scat_spc = wqe_size -
960 		((rwq->wq_sig) ? sizeof(struct mlx5_rwqe_sig) : 0);
961 	rwq->rq.max_gs = scat_spc / sizeof(struct mlx5_wqe_data_seg);
962 	return wq_size;
963 }
964 
965 static int mlx5_calc_rq_size(struct mlx5_context *ctx,
966 			     struct ibv_qp_init_attr_ex *attr,
967 			     struct mlx5_qp *qp)
968 {
969 	int wqe_size;
970 	int wq_size;
971 	int scat_spc;
972 	FILE *fp = ctx->dbg_fp;
973 
974 	if (!attr->cap.max_recv_wr)
975 		return 0;
976 
977 	if (attr->cap.max_recv_wr > ctx->max_recv_wr) {
978 		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
979 		return -EINVAL;
980 	}
981 
982 	wqe_size = mlx5_calc_rcv_wqe(ctx, attr, qp);
983 	if (wqe_size < 0 || wqe_size > ctx->max_rq_desc_sz) {
984 		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
985 		return -EINVAL;
986 	}
987 
988 	wq_size = mlx5_round_up_power_of_two(attr->cap.max_recv_wr) * wqe_size;
989 	if (wqe_size) {
990 		wq_size = max(wq_size, MLX5_SEND_WQE_BB);
991 		qp->rq.wqe_cnt = wq_size / wqe_size;
992 		qp->rq.wqe_shift = mlx5_ilog2(wqe_size);
993 		qp->rq.max_post = 1 << mlx5_ilog2(wq_size / wqe_size);
994 		scat_spc = wqe_size -
995 			(qp->wq_sig ? sizeof(struct mlx5_rwqe_sig) : 0);
996 		qp->rq.max_gs = scat_spc / sizeof(struct mlx5_wqe_data_seg);
997 	} else {
998 		qp->rq.wqe_cnt = 0;
999 		qp->rq.wqe_shift = 0;
1000 		qp->rq.max_post = 0;
1001 		qp->rq.max_gs = 0;
1002 	}
1003 	return wq_size;
1004 }
1005 
1006 static int mlx5_calc_wq_size(struct mlx5_context *ctx,
1007 			     struct ibv_qp_init_attr_ex *attr,
1008 			     struct mlx5_qp *qp)
1009 {
1010 	int ret;
1011 	int result;
1012 
1013 	ret = mlx5_calc_sq_size(ctx, attr, qp);
1014 	if (ret < 0)
1015 		return ret;
1016 
1017 	result = ret;
1018 	ret = mlx5_calc_rq_size(ctx, attr, qp);
1019 	if (ret < 0)
1020 		return ret;
1021 
1022 	result += ret;
1023 
1024 	qp->sq.offset = ret;
1025 	qp->rq.offset = 0;
1026 
1027 	return result;
1028 }
1029 
1030 static void map_uuar(struct ibv_context *context, struct mlx5_qp *qp,
1031 		     int uuar_index)
1032 {
1033 	struct mlx5_context *ctx = to_mctx(context);
1034 
1035 	qp->bf = &ctx->bfs[uuar_index];
1036 }
1037 
1038 static const char *qptype2key(enum ibv_qp_type type)
1039 {
1040 	switch (type) {
1041 	case IBV_QPT_RC: return "HUGE_RC";
1042 	case IBV_QPT_UC: return "HUGE_UC";
1043 	case IBV_QPT_UD: return "HUGE_UD";
1044 	case IBV_QPT_RAW_PACKET: return "HUGE_RAW_ETH";
1045 	default: return "HUGE_NA";
1046 	}
1047 }
1048 
1049 static int mlx5_alloc_qp_buf(struct ibv_context *context,
1050 			     struct ibv_qp_init_attr_ex *attr,
1051 			     struct mlx5_qp *qp,
1052 			     int size)
1053 {
1054 	int err;
1055 	enum mlx5_alloc_type alloc_type;
1056 	enum mlx5_alloc_type default_alloc_type = MLX5_ALLOC_TYPE_ANON;
1057 	const char *qp_huge_key;
1058 
1059 	if (qp->sq.wqe_cnt) {
1060 		qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid));
1061 		if (!qp->sq.wrid) {
1062 			errno = ENOMEM;
1063 			err = -1;
1064 			return err;
1065 		}
1066 
1067 		qp->sq.wr_data = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wr_data));
1068 		if (!qp->sq.wr_data) {
1069 			errno = ENOMEM;
1070 			err = -1;
1071 			goto ex_wrid;
1072 		}
1073 	}
1074 
1075 	qp->sq.wqe_head = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wqe_head));
1076 	if (!qp->sq.wqe_head) {
1077 		errno = ENOMEM;
1078 		err = -1;
1079 			goto ex_wrid;
1080 	}
1081 
1082 	if (qp->rq.wqe_cnt) {
1083 		qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(uint64_t));
1084 		if (!qp->rq.wrid) {
1085 			errno = ENOMEM;
1086 			err = -1;
1087 			goto ex_wrid;
1088 		}
1089 	}
1090 
1091 	/* compatibility support */
1092 	qp_huge_key  = qptype2key(qp->ibv_qp->qp_type);
1093 	if (mlx5_use_huge(qp_huge_key))
1094 		default_alloc_type = MLX5_ALLOC_TYPE_HUGE;
1095 
1096 	mlx5_get_alloc_type(MLX5_QP_PREFIX, &alloc_type,
1097 			    default_alloc_type);
1098 
1099 	err = mlx5_alloc_prefered_buf(to_mctx(context), &qp->buf,
1100 				      align(qp->buf_size, to_mdev
1101 				      (context->device)->page_size),
1102 				      to_mdev(context->device)->page_size,
1103 				      alloc_type,
1104 				      MLX5_QP_PREFIX);
1105 
1106 	if (err) {
1107 		err = -ENOMEM;
1108 		goto ex_wrid;
1109 	}
1110 
1111 	memset(qp->buf.buf, 0, qp->buf_size);
1112 
1113 	if (attr->qp_type == IBV_QPT_RAW_PACKET) {
1114 		size_t aligned_sq_buf_size = align(qp->sq_buf_size,
1115 						   to_mdev(context->device)->page_size);
1116 		/* For Raw Packet QP, allocate a separate buffer for the SQ */
1117 		err = mlx5_alloc_prefered_buf(to_mctx(context), &qp->sq_buf,
1118 					      aligned_sq_buf_size,
1119 					      to_mdev(context->device)->page_size,
1120 					      alloc_type,
1121 					      MLX5_QP_PREFIX);
1122 		if (err) {
1123 			err = -ENOMEM;
1124 			goto rq_buf;
1125 		}
1126 
1127 		memset(qp->sq_buf.buf, 0, aligned_sq_buf_size);
1128 	}
1129 
1130 	return 0;
1131 rq_buf:
1132 	mlx5_free_actual_buf(to_mctx(qp->verbs_qp.qp.context), &qp->buf);
1133 ex_wrid:
1134 	if (qp->rq.wrid)
1135 		free(qp->rq.wrid);
1136 
1137 	if (qp->sq.wqe_head)
1138 		free(qp->sq.wqe_head);
1139 
1140 	if (qp->sq.wr_data)
1141 		free(qp->sq.wr_data);
1142 	if (qp->sq.wrid)
1143 		free(qp->sq.wrid);
1144 
1145 	return err;
1146 }
1147 
1148 static void mlx5_free_qp_buf(struct mlx5_qp *qp)
1149 {
1150 	struct mlx5_context *ctx = to_mctx(qp->ibv_qp->context);
1151 
1152 	mlx5_free_actual_buf(ctx, &qp->buf);
1153 
1154 	if (qp->sq_buf.buf)
1155 		mlx5_free_actual_buf(ctx, &qp->sq_buf);
1156 
1157 	if (qp->rq.wrid)
1158 		free(qp->rq.wrid);
1159 
1160 	if (qp->sq.wqe_head)
1161 		free(qp->sq.wqe_head);
1162 
1163 	if (qp->sq.wrid)
1164 		free(qp->sq.wrid);
1165 
1166 	if (qp->sq.wr_data)
1167 		free(qp->sq.wr_data);
1168 }
1169 
1170 static int mlx5_cmd_create_rss_qp(struct ibv_context *context,
1171 				 struct ibv_qp_init_attr_ex *attr,
1172 				 struct mlx5_qp *qp)
1173 {
1174 	struct mlx5_create_qp_ex_rss cmd_ex_rss = {};
1175 	struct mlx5_create_qp_resp_ex resp = {};
1176 	int ret;
1177 
1178 	if (attr->rx_hash_conf.rx_hash_key_len > sizeof(cmd_ex_rss.rx_hash_key)) {
1179 		errno = EINVAL;
1180 		return errno;
1181 	}
1182 
1183 	cmd_ex_rss.rx_hash_fields_mask = attr->rx_hash_conf.rx_hash_fields_mask;
1184 	cmd_ex_rss.rx_hash_function = attr->rx_hash_conf.rx_hash_function;
1185 	cmd_ex_rss.rx_key_len = attr->rx_hash_conf.rx_hash_key_len;
1186 	memcpy(cmd_ex_rss.rx_hash_key, attr->rx_hash_conf.rx_hash_key,
1187 			attr->rx_hash_conf.rx_hash_key_len);
1188 
1189 	ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp,
1190 					    sizeof(qp->verbs_qp), attr,
1191 					    &cmd_ex_rss.ibv_cmd, sizeof(cmd_ex_rss.ibv_cmd),
1192 					    sizeof(cmd_ex_rss), &resp.ibv_resp,
1193 					    sizeof(resp.ibv_resp), sizeof(resp));
1194 	if (ret)
1195 		return ret;
1196 
1197 	qp->rss_qp = 1;
1198 	return 0;
1199 }
1200 
1201 static int mlx5_cmd_create_qp_ex(struct ibv_context *context,
1202 				 struct ibv_qp_init_attr_ex *attr,
1203 				 struct mlx5_create_qp *cmd,
1204 				 struct mlx5_qp *qp,
1205 				 struct mlx5_create_qp_resp_ex *resp)
1206 {
1207 	struct mlx5_create_qp_ex cmd_ex;
1208 	int ret;
1209 
1210 	memset(&cmd_ex, 0, sizeof(cmd_ex));
1211 	memcpy(&cmd_ex.ibv_cmd.base, &cmd->ibv_cmd.user_handle,
1212 	       offsetof(typeof(cmd->ibv_cmd), is_srq) +
1213 	       sizeof(cmd->ibv_cmd.is_srq) -
1214 	       offsetof(typeof(cmd->ibv_cmd), user_handle));
1215 
1216 	memcpy(&cmd_ex.drv_ex, &cmd->buf_addr,
1217 	       offsetof(typeof(*cmd), sq_buf_addr) +
1218 	       sizeof(cmd->sq_buf_addr) - sizeof(cmd->ibv_cmd));
1219 
1220 	ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp,
1221 				    sizeof(qp->verbs_qp), attr,
1222 				    &cmd_ex.ibv_cmd, sizeof(cmd_ex.ibv_cmd),
1223 				    sizeof(cmd_ex), &resp->ibv_resp,
1224 				    sizeof(resp->ibv_resp), sizeof(*resp));
1225 
1226 	return ret;
1227 }
1228 
1229 enum {
1230 	MLX5_CREATE_QP_SUP_COMP_MASK = (IBV_QP_INIT_ATTR_PD |
1231 					IBV_QP_INIT_ATTR_XRCD |
1232 					IBV_QP_INIT_ATTR_CREATE_FLAGS |
1233 					IBV_QP_INIT_ATTR_MAX_TSO_HEADER |
1234 					IBV_QP_INIT_ATTR_IND_TABLE |
1235 					IBV_QP_INIT_ATTR_RX_HASH),
1236 };
1237 
1238 enum {
1239 	MLX5_CREATE_QP_EX2_COMP_MASK = (IBV_QP_INIT_ATTR_CREATE_FLAGS |
1240 					IBV_QP_INIT_ATTR_MAX_TSO_HEADER |
1241 					IBV_QP_INIT_ATTR_IND_TABLE |
1242 					IBV_QP_INIT_ATTR_RX_HASH),
1243 };
1244 
1245 static struct ibv_qp *create_qp(struct ibv_context *context,
1246 			 struct ibv_qp_init_attr_ex *attr)
1247 {
1248 	struct mlx5_create_qp		cmd;
1249 	struct mlx5_create_qp_resp	resp;
1250 	struct mlx5_create_qp_resp_ex resp_ex;
1251 	struct mlx5_qp		       *qp;
1252 	int				ret;
1253 	struct mlx5_context	       *ctx = to_mctx(context);
1254 	struct ibv_qp		       *ibqp;
1255 	int32_t				usr_idx = 0;
1256 	uint32_t			uuar_index;
1257 	FILE *fp = ctx->dbg_fp;
1258 
1259 	if (attr->comp_mask & ~MLX5_CREATE_QP_SUP_COMP_MASK)
1260 		return NULL;
1261 
1262 	if ((attr->comp_mask & IBV_QP_INIT_ATTR_MAX_TSO_HEADER) &&
1263 	    (attr->qp_type != IBV_QPT_RAW_PACKET))
1264 		return NULL;
1265 
1266 	qp = calloc(1, sizeof(*qp));
1267 	if (!qp) {
1268 		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
1269 		return NULL;
1270 	}
1271 	ibqp = (struct ibv_qp *)&qp->verbs_qp;
1272 	qp->ibv_qp = ibqp;
1273 
1274 	memset(&cmd, 0, sizeof(cmd));
1275 	memset(&resp, 0, sizeof(resp));
1276 	memset(&resp_ex, 0, sizeof(resp_ex));
1277 
1278 	if (attr->comp_mask & IBV_QP_INIT_ATTR_RX_HASH) {
1279 		ret = mlx5_cmd_create_rss_qp(context, attr, qp);
1280 		if (ret)
1281 			goto err;
1282 
1283 		return ibqp;
1284 	}
1285 
1286 	qp->wq_sig = qp_sig_enabled();
1287 	if (qp->wq_sig)
1288 		cmd.flags |= MLX5_QP_FLAG_SIGNATURE;
1289 
1290 	if (use_scatter_to_cqe())
1291 		cmd.flags |= MLX5_QP_FLAG_SCATTER_CQE;
1292 
1293 	ret = mlx5_calc_wq_size(ctx, attr, qp);
1294 	if (ret < 0) {
1295 		errno = -ret;
1296 		goto err;
1297 	}
1298 
1299 	if (attr->qp_type == IBV_QPT_RAW_PACKET) {
1300 		qp->buf_size = qp->sq.offset;
1301 		qp->sq_buf_size = ret - qp->buf_size;
1302 		qp->sq.offset = 0;
1303 	} else {
1304 		qp->buf_size = ret;
1305 		qp->sq_buf_size = 0;
1306 	}
1307 
1308 	if (mlx5_alloc_qp_buf(context, attr, qp, ret)) {
1309 		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
1310 		goto err;
1311 	}
1312 
1313 	if (attr->qp_type == IBV_QPT_RAW_PACKET) {
1314 		qp->sq_start = qp->sq_buf.buf;
1315 		qp->sq.qend = qp->sq_buf.buf +
1316 				(qp->sq.wqe_cnt << qp->sq.wqe_shift);
1317 	} else {
1318 		qp->sq_start = qp->buf.buf + qp->sq.offset;
1319 		qp->sq.qend = qp->buf.buf + qp->sq.offset +
1320 				(qp->sq.wqe_cnt << qp->sq.wqe_shift);
1321 	}
1322 
1323 	mlx5_init_qp_indices(qp);
1324 
1325 	if (mlx5_spinlock_init(&qp->sq.lock))
1326 		goto err_free_qp_buf;
1327 
1328 	if (mlx5_spinlock_init(&qp->rq.lock))
1329 		goto err_sq_spl;
1330 
1331 	qp->db = mlx5_alloc_dbrec(ctx);
1332 	if (!qp->db) {
1333 		mlx5_dbg(fp, MLX5_DBG_QP, "\n");
1334 		goto err_rq_spl;
1335 	}
1336 
1337 	qp->db[MLX5_RCV_DBR] = 0;
1338 	qp->db[MLX5_SND_DBR] = 0;
1339 
1340 	cmd.buf_addr = (uintptr_t) qp->buf.buf;
1341 	cmd.sq_buf_addr = (attr->qp_type == IBV_QPT_RAW_PACKET) ?
1342 			  (uintptr_t) qp->sq_buf.buf : 0;
1343 	cmd.db_addr  = (uintptr_t) qp->db;
1344 	cmd.sq_wqe_count = qp->sq.wqe_cnt;
1345 	cmd.rq_wqe_count = qp->rq.wqe_cnt;
1346 	cmd.rq_wqe_shift = qp->rq.wqe_shift;
1347 
1348 	if (ctx->atomic_cap == IBV_ATOMIC_HCA)
1349 		qp->atomics_enabled = 1;
1350 
1351 	if (!ctx->cqe_version) {
1352 		cmd.uidx = 0xffffff;
1353 		pthread_mutex_lock(&ctx->qp_table_mutex);
1354 	} else if (!is_xrc_tgt(attr->qp_type)) {
1355 		usr_idx = mlx5_store_uidx(ctx, qp);
1356 		if (usr_idx < 0) {
1357 			mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n");
1358 			goto err_rq_db;
1359 		}
1360 
1361 		cmd.uidx = usr_idx;
1362 	}
1363 
1364 	if (attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK)
1365 		ret = mlx5_cmd_create_qp_ex(context, attr, &cmd, qp, &resp_ex);
1366 	else
1367 		ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, sizeof(qp->verbs_qp),
1368 					   attr, &cmd.ibv_cmd, sizeof(cmd),
1369 					   &resp.ibv_resp, sizeof(resp));
1370 	if (ret) {
1371 		mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret);
1372 		goto err_free_uidx;
1373 	}
1374 
1375 	uuar_index = (attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK) ?
1376 			resp_ex.uuar_index : resp.uuar_index;
1377 	if (!ctx->cqe_version) {
1378 		if (qp->sq.wqe_cnt || qp->rq.wqe_cnt) {
1379 			ret = mlx5_store_qp(ctx, ibqp->qp_num, qp);
1380 			if (ret) {
1381 				mlx5_dbg(fp, MLX5_DBG_QP, "ret %d\n", ret);
1382 				goto err_destroy;
1383 			}
1384 		}
1385 
1386 		pthread_mutex_unlock(&ctx->qp_table_mutex);
1387 	}
1388 
1389 	map_uuar(context, qp, uuar_index);
1390 
1391 	qp->rq.max_post = qp->rq.wqe_cnt;
1392 	if (attr->sq_sig_all)
1393 		qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
1394 	else
1395 		qp->sq_signal_bits = 0;
1396 
1397 	attr->cap.max_send_wr = qp->sq.max_post;
1398 	attr->cap.max_recv_wr = qp->rq.max_post;
1399 	attr->cap.max_recv_sge = qp->rq.max_gs;
1400 
1401 	qp->rsc.type = MLX5_RSC_TYPE_QP;
1402 	qp->rsc.rsn = (ctx->cqe_version && !is_xrc_tgt(attr->qp_type)) ?
1403 		      usr_idx : ibqp->qp_num;
1404 
1405 	return ibqp;
1406 
1407 err_destroy:
1408 	ibv_cmd_destroy_qp(ibqp);
1409 
1410 err_free_uidx:
1411 	if (!ctx->cqe_version)
1412 		pthread_mutex_unlock(&to_mctx(context)->qp_table_mutex);
1413 	else if (!is_xrc_tgt(attr->qp_type))
1414 		mlx5_clear_uidx(ctx, usr_idx);
1415 
1416 err_rq_db:
1417 	mlx5_free_db(to_mctx(context), qp->db);
1418 
1419 err_rq_spl:
1420 	mlx5_spinlock_destroy(&qp->rq.lock);
1421 
1422 err_sq_spl:
1423 	mlx5_spinlock_destroy(&qp->sq.lock);
1424 
1425 err_free_qp_buf:
1426 	mlx5_free_qp_buf(qp);
1427 
1428 err:
1429 	free(qp);
1430 
1431 	return NULL;
1432 }
1433 
1434 struct ibv_qp *mlx5_create_qp(struct ibv_pd *pd,
1435 			      struct ibv_qp_init_attr *attr)
1436 {
1437 	struct ibv_qp *qp;
1438 	struct ibv_qp_init_attr_ex attrx;
1439 
1440 	memset(&attrx, 0, sizeof(attrx));
1441 	memcpy(&attrx, attr, sizeof(*attr));
1442 	attrx.comp_mask = IBV_QP_INIT_ATTR_PD;
1443 	attrx.pd = pd;
1444 	qp = create_qp(pd->context, &attrx);
1445 	if (qp)
1446 		memcpy(attr, &attrx, sizeof(*attr));
1447 
1448 	return qp;
1449 }
1450 
1451 static void mlx5_lock_cqs(struct ibv_qp *qp)
1452 {
1453 	struct mlx5_cq *send_cq = to_mcq(qp->send_cq);
1454 	struct mlx5_cq *recv_cq = to_mcq(qp->recv_cq);
1455 
1456 	if (send_cq && recv_cq) {
1457 		if (send_cq == recv_cq) {
1458 			mlx5_spin_lock(&send_cq->lock);
1459 		} else if (send_cq->cqn < recv_cq->cqn) {
1460 			mlx5_spin_lock(&send_cq->lock);
1461 			mlx5_spin_lock(&recv_cq->lock);
1462 		} else {
1463 			mlx5_spin_lock(&recv_cq->lock);
1464 			mlx5_spin_lock(&send_cq->lock);
1465 		}
1466 	} else if (send_cq) {
1467 		mlx5_spin_lock(&send_cq->lock);
1468 	} else if (recv_cq) {
1469 		mlx5_spin_lock(&recv_cq->lock);
1470 	}
1471 }
1472 
1473 static void mlx5_unlock_cqs(struct ibv_qp *qp)
1474 {
1475 	struct mlx5_cq *send_cq = to_mcq(qp->send_cq);
1476 	struct mlx5_cq *recv_cq = to_mcq(qp->recv_cq);
1477 
1478 	if (send_cq && recv_cq) {
1479 		if (send_cq == recv_cq) {
1480 			mlx5_spin_unlock(&send_cq->lock);
1481 		} else if (send_cq->cqn < recv_cq->cqn) {
1482 			mlx5_spin_unlock(&recv_cq->lock);
1483 			mlx5_spin_unlock(&send_cq->lock);
1484 		} else {
1485 			mlx5_spin_unlock(&send_cq->lock);
1486 			mlx5_spin_unlock(&recv_cq->lock);
1487 		}
1488 	} else if (send_cq) {
1489 		mlx5_spin_unlock(&send_cq->lock);
1490 	} else if (recv_cq) {
1491 		mlx5_spin_unlock(&recv_cq->lock);
1492 	}
1493 }
1494 
1495 int mlx5_destroy_qp(struct ibv_qp *ibqp)
1496 {
1497 	struct mlx5_qp *qp = to_mqp(ibqp);
1498 	struct mlx5_context *ctx = to_mctx(ibqp->context);
1499 	int ret;
1500 
1501 	if (qp->rss_qp) {
1502 		ret = ibv_cmd_destroy_qp(ibqp);
1503 		if (ret)
1504 			return ret;
1505 		goto free;
1506 	}
1507 
1508 	if (!ctx->cqe_version)
1509 		pthread_mutex_lock(&ctx->qp_table_mutex);
1510 
1511 	ret = ibv_cmd_destroy_qp(ibqp);
1512 	if (ret) {
1513 		if (!ctx->cqe_version)
1514 			pthread_mutex_unlock(&ctx->qp_table_mutex);
1515 		return ret;
1516 	}
1517 
1518 	mlx5_lock_cqs(ibqp);
1519 
1520 	__mlx5_cq_clean(to_mcq(ibqp->recv_cq), qp->rsc.rsn,
1521 			ibqp->srq ? to_msrq(ibqp->srq) : NULL);
1522 	if (ibqp->send_cq != ibqp->recv_cq)
1523 		__mlx5_cq_clean(to_mcq(ibqp->send_cq), qp->rsc.rsn, NULL);
1524 
1525 	if (!ctx->cqe_version) {
1526 		if (qp->sq.wqe_cnt || qp->rq.wqe_cnt)
1527 			mlx5_clear_qp(ctx, ibqp->qp_num);
1528 	}
1529 
1530 	mlx5_unlock_cqs(ibqp);
1531 	if (!ctx->cqe_version)
1532 		pthread_mutex_unlock(&ctx->qp_table_mutex);
1533 	else if (!is_xrc_tgt(ibqp->qp_type))
1534 		mlx5_clear_uidx(ctx, qp->rsc.rsn);
1535 
1536 	mlx5_free_db(ctx, qp->db);
1537 	mlx5_spinlock_destroy(&qp->rq.lock);
1538 	mlx5_spinlock_destroy(&qp->sq.lock);
1539 	mlx5_free_qp_buf(qp);
1540 free:
1541 	free(qp);
1542 
1543 	return 0;
1544 }
1545 
1546 int mlx5_query_qp(struct ibv_qp *ibqp, struct ibv_qp_attr *attr,
1547 		  int attr_mask, struct ibv_qp_init_attr *init_attr)
1548 {
1549 	struct ibv_query_qp cmd;
1550 	struct mlx5_qp *qp = to_mqp(ibqp);
1551 	int ret;
1552 
1553 	if (qp->rss_qp)
1554 		return ENOSYS;
1555 
1556 	ret = ibv_cmd_query_qp(ibqp, attr, attr_mask, init_attr, &cmd, sizeof(cmd));
1557 	if (ret)
1558 		return ret;
1559 
1560 	init_attr->cap.max_send_wr     = qp->sq.max_post;
1561 	init_attr->cap.max_send_sge    = qp->sq.max_gs;
1562 	init_attr->cap.max_inline_data = qp->max_inline_data;
1563 
1564 	attr->cap = init_attr->cap;
1565 
1566 	return 0;
1567 }
1568 
1569 enum {
1570 	MLX5_MODIFY_QP_EX_ATTR_MASK = IBV_QP_RATE_LIMIT,
1571 };
1572 
1573 int mlx5_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
1574 		   int attr_mask)
1575 {
1576 	struct ibv_modify_qp cmd = {};
1577 	struct ibv_modify_qp_ex cmd_ex = {};
1578 	struct ibv_modify_qp_resp_ex resp = {};
1579 	struct mlx5_qp *mqp = to_mqp(qp);
1580 	struct mlx5_context *context = to_mctx(qp->context);
1581 	int ret;
1582 	uint32_t *db;
1583 
1584 	if (mqp->rss_qp)
1585 		return ENOSYS;
1586 
1587 	if (attr_mask & IBV_QP_PORT) {
1588 		switch (qp->qp_type) {
1589 		case IBV_QPT_RAW_PACKET:
1590 			if (context->cached_link_layer[attr->port_num - 1] ==
1591 			     IBV_LINK_LAYER_ETHERNET) {
1592 				if (context->cached_device_cap_flags &
1593 				    IBV_DEVICE_RAW_IP_CSUM)
1594 					mqp->qp_cap_cache |=
1595 						MLX5_CSUM_SUPPORT_RAW_OVER_ETH |
1596 						MLX5_RX_CSUM_VALID;
1597 
1598 				if (ibv_is_qpt_supported(
1599 				 context->cached_tso_caps.supported_qpts,
1600 				 IBV_QPT_RAW_PACKET))
1601 					mqp->max_tso =
1602 					     context->cached_tso_caps.max_tso;
1603 			}
1604 			break;
1605 		default:
1606 			break;
1607 		}
1608 	}
1609 
1610 	if (attr_mask & MLX5_MODIFY_QP_EX_ATTR_MASK)
1611 		ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask,
1612 					   &cmd_ex,
1613 					   sizeof(cmd_ex), sizeof(cmd_ex),
1614 					   &resp,
1615 					   sizeof(resp), sizeof(resp));
1616 	else
1617 		ret = ibv_cmd_modify_qp(qp, attr, attr_mask,
1618 					&cmd, sizeof(cmd));
1619 
1620 	if (!ret		       &&
1621 	    (attr_mask & IBV_QP_STATE) &&
1622 	    attr->qp_state == IBV_QPS_RESET) {
1623 		if (qp->recv_cq) {
1624 			mlx5_cq_clean(to_mcq(qp->recv_cq), mqp->rsc.rsn,
1625 				      qp->srq ? to_msrq(qp->srq) : NULL);
1626 		}
1627 		if (qp->send_cq != qp->recv_cq && qp->send_cq)
1628 			mlx5_cq_clean(to_mcq(qp->send_cq),
1629 				      to_mqp(qp)->rsc.rsn, NULL);
1630 
1631 		mlx5_init_qp_indices(mqp);
1632 		db = mqp->db;
1633 		db[MLX5_RCV_DBR] = 0;
1634 		db[MLX5_SND_DBR] = 0;
1635 	}
1636 
1637 	/*
1638 	 * When the Raw Packet QP is in INIT state, its RQ
1639 	 * underneath is already in RDY, which means it can
1640 	 * receive packets. According to the IB spec, a QP can't
1641 	 * receive packets until moved to RTR state. To achieve this,
1642 	 * for Raw Packet QPs, we update the doorbell record
1643 	 * once the QP is moved to RTR.
1644 	 */
1645 	if (!ret &&
1646 	    (attr_mask & IBV_QP_STATE) &&
1647 	    attr->qp_state == IBV_QPS_RTR &&
1648 	    qp->qp_type == IBV_QPT_RAW_PACKET) {
1649 		mlx5_spin_lock(&mqp->rq.lock);
1650 		mqp->db[MLX5_RCV_DBR] = htobe32(mqp->rq.head & 0xffff);
1651 		mlx5_spin_unlock(&mqp->rq.lock);
1652 	}
1653 
1654 	return ret;
1655 }
1656 
1657 #define RROCE_UDP_SPORT_MIN 0xC000
1658 #define RROCE_UDP_SPORT_MAX 0xFFFF
1659 struct ibv_ah *mlx5_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr)
1660 {
1661 	struct mlx5_context *ctx = to_mctx(pd->context);
1662 	struct ibv_port_attr port_attr;
1663 	struct mlx5_ah *ah;
1664 	uint32_t gid_type;
1665 	uint32_t tmp;
1666 	uint8_t grh;
1667 	int is_eth;
1668 
1669 	if (attr->port_num < 1 || attr->port_num > ctx->num_ports)
1670 		return NULL;
1671 
1672 	if (ctx->cached_link_layer[attr->port_num - 1]) {
1673 		is_eth = ctx->cached_link_layer[attr->port_num - 1] ==
1674 			IBV_LINK_LAYER_ETHERNET;
1675 	} else {
1676 		if (ibv_query_port(pd->context, attr->port_num, &port_attr))
1677 			return NULL;
1678 
1679 		is_eth = (port_attr.link_layer == IBV_LINK_LAYER_ETHERNET);
1680 	}
1681 
1682 	if (unlikely((!attr->is_global) && is_eth)) {
1683 		errno = EINVAL;
1684 		return NULL;
1685 	}
1686 
1687 	ah = calloc(1, sizeof *ah);
1688 	if (!ah)
1689 		return NULL;
1690 
1691 	if (is_eth) {
1692 		if (ibv_query_gid_type(pd->context, attr->port_num,
1693 				       attr->grh.sgid_index, &gid_type))
1694 			goto err;
1695 
1696 		if (gid_type == IBV_GID_TYPE_ROCE_V2)
1697 			ah->av.rlid = htobe16(rand() % (RROCE_UDP_SPORT_MAX + 1
1698 						      - RROCE_UDP_SPORT_MIN)
1699 					    + RROCE_UDP_SPORT_MIN);
1700 		/* Since RoCE packets must contain GRH, this bit is reserved
1701 		 * for RoCE and shouldn't be set.
1702 		 */
1703 		grh = 0;
1704 	} else {
1705 		ah->av.fl_mlid = attr->src_path_bits & 0x7f;
1706 		ah->av.rlid = htobe16(attr->dlid);
1707 		grh = 1;
1708 	}
1709 	ah->av.stat_rate_sl = (attr->static_rate << 4) | attr->sl;
1710 	if (attr->is_global) {
1711 		ah->av.tclass = attr->grh.traffic_class;
1712 		ah->av.hop_limit = attr->grh.hop_limit;
1713 		tmp = htobe32((grh << 30) |
1714 			    ((attr->grh.sgid_index & 0xff) << 20) |
1715 			    (attr->grh.flow_label & 0xfffff));
1716 		ah->av.grh_gid_fl = tmp;
1717 		memcpy(ah->av.rgid, attr->grh.dgid.raw, 16);
1718 	}
1719 
1720 	if (is_eth) {
1721 		if (ctx->cmds_supp_uhw & MLX5_USER_CMDS_SUPP_UHW_CREATE_AH) {
1722 			struct mlx5_create_ah_resp resp = {};
1723 
1724 			if (ibv_cmd_create_ah(pd, &ah->ibv_ah, attr, &resp.ibv_resp, sizeof(resp)))
1725 				goto err;
1726 
1727 			ah->kern_ah = true;
1728 			memcpy(ah->av.rmac, resp.dmac, ETHERNET_LL_SIZE);
1729 		} else {
1730 			uint16_t vid;
1731 
1732 			if (ibv_resolve_eth_l2_from_gid(pd->context, attr,
1733 							ah->av.rmac, &vid))
1734 				goto err;
1735 		}
1736 	}
1737 
1738 	return &ah->ibv_ah;
1739 err:
1740 	free(ah);
1741 	return NULL;
1742 }
1743 
1744 int mlx5_destroy_ah(struct ibv_ah *ah)
1745 {
1746 	struct mlx5_ah *mah = to_mah(ah);
1747 	int err;
1748 
1749 	if (mah->kern_ah) {
1750 		err = ibv_cmd_destroy_ah(ah);
1751 		if (err)
1752 			return err;
1753 	}
1754 
1755 	free(mah);
1756 	return 0;
1757 }
1758 
/*
 * Attach a QP to the multicast group given by gid/lid.  This is a direct
 * pass-through to ibv_cmd_attach_mcast(), whose result is returned.
 */
int mlx5_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
{
	int rc = ibv_cmd_attach_mcast(qp, gid, lid);

	return rc;
}
1763 
/*
 * Detach a QP from the multicast group given by gid/lid.  This is a
 * direct pass-through to ibv_cmd_detach_mcast(), whose result is returned.
 */
int mlx5_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid)
{
	int rc = ibv_cmd_detach_mcast(qp, gid, lid);

	return rc;
}
1768 
/*
 * Extended create-QP entry point: the attributes are passed unchanged to
 * create_qp().  Returns the new QP, or NULL on failure.
 */
struct ibv_qp *mlx5_create_qp_ex(struct ibv_context *context,
				 struct ibv_qp_init_attr_ex *attr)
{
	struct ibv_qp *ibqp = create_qp(context, attr);

	return ibqp;
}
1774 
1775 int mlx5_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num)
1776 {
1777 	struct mlx5_srq *msrq = to_msrq(srq);
1778 
1779 	*srq_num = msrq->srqn;
1780 
1781 	return 0;
1782 }
1783 
1784 struct ibv_xrcd *
1785 mlx5_open_xrcd(struct ibv_context *context,
1786 	       struct ibv_xrcd_init_attr *xrcd_init_attr)
1787 {
1788 	int err;
1789 	struct verbs_xrcd *xrcd;
1790 	struct ibv_open_xrcd cmd = {};
1791 	struct ibv_open_xrcd_resp resp = {};
1792 
1793 	xrcd = calloc(1, sizeof(*xrcd));
1794 	if (!xrcd)
1795 		return NULL;
1796 
1797 	err = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), xrcd_init_attr,
1798 				&cmd, sizeof(cmd), &resp, sizeof(resp));
1799 	if (err) {
1800 		free(xrcd);
1801 		return NULL;
1802 	}
1803 
1804 	return &xrcd->xrcd;
1805 }
1806 
1807 int mlx5_close_xrcd(struct ibv_xrcd *ib_xrcd)
1808 {
1809 	struct verbs_xrcd *xrcd = container_of(ib_xrcd, struct verbs_xrcd, xrcd);
1810 	int ret;
1811 
1812 	ret = ibv_cmd_close_xrcd(xrcd);
1813 	if (!ret)
1814 		free(xrcd);
1815 
1816 	return ret;
1817 }
1818 
1819 static struct ibv_srq *
1820 mlx5_create_xrc_srq(struct ibv_context *context,
1821 		    struct ibv_srq_init_attr_ex *attr)
1822 {
1823 	int err;
1824 	struct mlx5_create_srq_ex cmd;
1825 	struct mlx5_create_srq_resp resp;
1826 	struct mlx5_srq *msrq;
1827 	struct mlx5_context *ctx = to_mctx(context);
1828 	int max_sge;
1829 	struct ibv_srq *ibsrq;
1830 	int uidx;
1831 	FILE *fp = ctx->dbg_fp;
1832 
1833 	msrq = calloc(1, sizeof(*msrq));
1834 	if (!msrq)
1835 		return NULL;
1836 
1837 	ibsrq = (struct ibv_srq *)&msrq->vsrq;
1838 
1839 	memset(&cmd, 0, sizeof(cmd));
1840 	memset(&resp, 0, sizeof(resp));
1841 
1842 	if (mlx5_spinlock_init(&msrq->lock)) {
1843 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
1844 		goto err;
1845 	}
1846 
1847 	if (attr->attr.max_wr > ctx->max_srq_recv_wr) {
1848 		fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n",
1849 			__func__, __LINE__, attr->attr.max_wr,
1850 			ctx->max_srq_recv_wr);
1851 		errno = EINVAL;
1852 		goto err_spl;
1853 	}
1854 
1855 	/*
1856 	 * this calculation does not consider required control segments. The
1857 	 * final calculation is done again later. This is done so to avoid
1858 	 * overflows of variables
1859 	 */
1860 	max_sge = ctx->max_recv_wr / sizeof(struct mlx5_wqe_data_seg);
1861 	if (attr->attr.max_sge > max_sge) {
1862 		fprintf(stderr, "%s-%d:max_wr %d, max_srq_recv_wr %d\n",
1863 			__func__, __LINE__, attr->attr.max_wr,
1864 			ctx->max_srq_recv_wr);
1865 		errno = EINVAL;
1866 		goto err_spl;
1867 	}
1868 
1869 	msrq->max     = align_queue_size(attr->attr.max_wr + 1);
1870 	msrq->max_gs  = attr->attr.max_sge;
1871 	msrq->counter = 0;
1872 
1873 	if (mlx5_alloc_srq_buf(context, msrq)) {
1874 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
1875 		goto err_spl;
1876 	}
1877 
1878 	msrq->db = mlx5_alloc_dbrec(ctx);
1879 	if (!msrq->db) {
1880 		fprintf(stderr, "%s-%d:\n", __func__, __LINE__);
1881 		goto err_free;
1882 	}
1883 
1884 	*msrq->db = 0;
1885 
1886 	cmd.buf_addr = (uintptr_t)msrq->buf.buf;
1887 	cmd.db_addr  = (uintptr_t)msrq->db;
1888 	msrq->wq_sig = srq_sig_enabled();
1889 	if (msrq->wq_sig)
1890 		cmd.flags = MLX5_SRQ_FLAG_SIGNATURE;
1891 
1892 	attr->attr.max_sge = msrq->max_gs;
1893 	if (ctx->cqe_version) {
1894 		uidx = mlx5_store_uidx(ctx, msrq);
1895 		if (uidx < 0) {
1896 			mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n");
1897 			goto err_free_db;
1898 		}
1899 		cmd.uidx = uidx;
1900 	} else {
1901 		cmd.uidx = 0xffffff;
1902 		pthread_mutex_lock(&ctx->srq_table_mutex);
1903 	}
1904 
1905 	err = ibv_cmd_create_srq_ex(context, &msrq->vsrq, sizeof(msrq->vsrq),
1906 				    attr, &cmd.ibv_cmd, sizeof(cmd),
1907 				    &resp.ibv_resp, sizeof(resp));
1908 	if (err)
1909 		goto err_free_uidx;
1910 
1911 	if (!ctx->cqe_version) {
1912 		err = mlx5_store_srq(to_mctx(context), resp.srqn, msrq);
1913 		if (err)
1914 			goto err_destroy;
1915 
1916 		pthread_mutex_unlock(&ctx->srq_table_mutex);
1917 	}
1918 
1919 	msrq->srqn = resp.srqn;
1920 	msrq->rsc.type = MLX5_RSC_TYPE_XSRQ;
1921 	msrq->rsc.rsn = ctx->cqe_version ? cmd.uidx : resp.srqn;
1922 
1923 	return ibsrq;
1924 
1925 err_destroy:
1926 	ibv_cmd_destroy_srq(ibsrq);
1927 
1928 err_free_uidx:
1929 	if (ctx->cqe_version)
1930 		mlx5_clear_uidx(ctx, cmd.uidx);
1931 	else
1932 		pthread_mutex_unlock(&ctx->srq_table_mutex);
1933 
1934 err_free_db:
1935 	mlx5_free_db(ctx, msrq->db);
1936 
1937 err_free:
1938 	free(msrq->wrid);
1939 	mlx5_free_buf(&msrq->buf);
1940 
1941 err_spl:
1942 	mlx5_spinlock_destroy(&msrq->lock);
1943 
1944 err:
1945 	free(msrq);
1946 
1947 	return NULL;
1948 }
1949 
1950 struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context,
1951 				   struct ibv_srq_init_attr_ex *attr)
1952 {
1953 	if (!(attr->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ||
1954 	    (attr->srq_type == IBV_SRQT_BASIC))
1955 		return mlx5_create_srq(attr->pd,
1956 				       (struct ibv_srq_init_attr *)attr);
1957 	else if (attr->srq_type == IBV_SRQT_XRC)
1958 		return mlx5_create_xrc_srq(context, attr);
1959 
1960 	return NULL;
1961 }
1962 
1963 int mlx5_query_device_ex(struct ibv_context *context,
1964 			 const struct ibv_query_device_ex_input *input,
1965 			 struct ibv_device_attr_ex *attr,
1966 			 size_t attr_size)
1967 {
1968 	struct mlx5_context *mctx = to_mctx(context);
1969 	struct mlx5_query_device_ex_resp resp;
1970 	struct mlx5_query_device_ex cmd;
1971 	struct ibv_device_attr *a;
1972 	uint64_t raw_fw_ver;
1973 	unsigned sub_minor;
1974 	unsigned major;
1975 	unsigned minor;
1976 	int err;
1977 	int cmd_supp_uhw = mctx->cmds_supp_uhw &
1978 		MLX5_USER_CMDS_SUPP_UHW_QUERY_DEVICE;
1979 
1980 	memset(&cmd, 0, sizeof(cmd));
1981 	memset(&resp, 0, sizeof(resp));
1982 	err = ibv_cmd_query_device_ex(context, input, attr, attr_size,
1983 				      &raw_fw_ver,
1984 				      &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd),
1985 				      &resp.ibv_resp, sizeof(resp.ibv_resp),
1986 				      cmd_supp_uhw ? sizeof(resp) : sizeof(resp.ibv_resp));
1987 	if (err)
1988 		return err;
1989 
1990 	attr->tso_caps = resp.tso_caps;
1991 	attr->rss_caps.rx_hash_fields_mask = resp.rss_caps.rx_hash_fields_mask;
1992 	attr->rss_caps.rx_hash_function = resp.rss_caps.rx_hash_function;
1993 	attr->packet_pacing_caps = resp.packet_pacing_caps.caps;
1994 
1995 	if (resp.support_multi_pkt_send_wqe)
1996 		mctx->vendor_cap_flags |= MLX5_VENDOR_CAP_FLAGS_MPW;
1997 
1998 	mctx->cqe_comp_caps = resp.cqe_comp_caps;
1999 
2000 	major     = (raw_fw_ver >> 32) & 0xffff;
2001 	minor     = (raw_fw_ver >> 16) & 0xffff;
2002 	sub_minor = raw_fw_ver & 0xffff;
2003 	a = &attr->orig_attr;
2004 	snprintf(a->fw_ver, sizeof(a->fw_ver), "%d.%d.%04d",
2005 		 major, minor, sub_minor);
2006 
2007 	return 0;
2008 }
2009 
/*
 * Report whether the MLX5_RWQ_SIGNATURE environment variable is set.
 * Its value is ignored and the context argument is unused.  Returns 1
 * when the variable exists, 0 otherwise.
 */
static int rwq_sig_enabled(struct ibv_context *context)
{
	return getenv("MLX5_RWQ_SIGNATURE") != NULL;
}
2020 
2021 static void mlx5_free_rwq_buf(struct mlx5_rwq *rwq, struct ibv_context *context)
2022 {
2023 	struct mlx5_context *ctx = to_mctx(context);
2024 
2025 	mlx5_free_actual_buf(ctx, &rwq->buf);
2026 	free(rwq->rq.wrid);
2027 }
2028 
2029 static int mlx5_alloc_rwq_buf(struct ibv_context *context,
2030 			      struct mlx5_rwq *rwq,
2031 			      int size)
2032 {
2033 	int err;
2034 	enum mlx5_alloc_type default_alloc_type = MLX5_ALLOC_TYPE_PREFER_CONTIG;
2035 
2036 	rwq->rq.wrid = malloc(rwq->rq.wqe_cnt * sizeof(uint64_t));
2037 	if (!rwq->rq.wrid) {
2038 		errno = ENOMEM;
2039 		return -1;
2040 	}
2041 
2042 	err = mlx5_alloc_prefered_buf(to_mctx(context), &rwq->buf,
2043 				      align(rwq->buf_size, to_mdev
2044 				      (context->device)->page_size),
2045 				      to_mdev(context->device)->page_size,
2046 				      default_alloc_type,
2047 				      MLX5_RWQ_PREFIX);
2048 
2049 	if (err) {
2050 		free(rwq->rq.wrid);
2051 		errno = ENOMEM;
2052 		return -1;
2053 	}
2054 
2055 	return 0;
2056 }
2057 
2058 struct ibv_wq *mlx5_create_wq(struct ibv_context *context,
2059 			      struct ibv_wq_init_attr *attr)
2060 {
2061 	struct mlx5_create_wq		cmd;
2062 	struct mlx5_create_wq_resp		resp;
2063 	int				err;
2064 	struct mlx5_rwq			*rwq;
2065 	struct mlx5_context	*ctx = to_mctx(context);
2066 	int ret;
2067 	int32_t				usr_idx = 0;
2068 	FILE *fp = ctx->dbg_fp;
2069 
2070 	if (attr->wq_type != IBV_WQT_RQ)
2071 		return NULL;
2072 
2073 	memset(&cmd, 0, sizeof(cmd));
2074 	memset(&resp, 0, sizeof(resp));
2075 
2076 	rwq = calloc(1, sizeof(*rwq));
2077 	if (!rwq)
2078 		return NULL;
2079 
2080 	rwq->wq_sig = rwq_sig_enabled(context);
2081 	if (rwq->wq_sig)
2082 		cmd.drv.flags = MLX5_RWQ_FLAG_SIGNATURE;
2083 
2084 	ret = mlx5_calc_rwq_size(ctx, rwq, attr);
2085 	if (ret < 0) {
2086 		errno = -ret;
2087 		goto err;
2088 	}
2089 
2090 	ret = ibv_init_wq(&rwq->wq);
2091 	if (ret < 0)
2092 		goto err;
2093 
2094 	rwq->buf_size = ret;
2095 	if (mlx5_alloc_rwq_buf(context, rwq, ret))
2096 		goto err_cleanup_wq;
2097 
2098 	mlx5_init_rwq_indices(rwq);
2099 
2100 	if (mlx5_spinlock_init(&rwq->rq.lock))
2101 		goto err_free_rwq_buf;
2102 
2103 	rwq->db = mlx5_alloc_dbrec(ctx);
2104 	if (!rwq->db)
2105 		goto err_spl;
2106 
2107 	rwq->db[MLX5_RCV_DBR] = 0;
2108 	rwq->db[MLX5_SND_DBR] = 0;
2109 	rwq->pbuff = rwq->buf.buf + rwq->rq.offset;
2110 	rwq->recv_db =  &rwq->db[MLX5_RCV_DBR];
2111 	cmd.drv.buf_addr = (uintptr_t)rwq->buf.buf;
2112 	cmd.drv.db_addr  = (uintptr_t)rwq->db;
2113 	cmd.drv.rq_wqe_count = rwq->rq.wqe_cnt;
2114 	cmd.drv.rq_wqe_shift = rwq->rq.wqe_shift;
2115 	usr_idx = mlx5_store_uidx(ctx, rwq);
2116 	if (usr_idx < 0) {
2117 		mlx5_dbg(fp, MLX5_DBG_QP, "Couldn't find free user index\n");
2118 		goto err_free_db_rec;
2119 	}
2120 
2121 	cmd.drv.user_index = usr_idx;
2122 	err = ibv_cmd_create_wq(context, attr, &rwq->wq, &cmd.ibv_cmd,
2123 				sizeof(cmd.ibv_cmd),
2124 				sizeof(cmd),
2125 				&resp.ibv_resp, sizeof(resp.ibv_resp),
2126 				sizeof(resp));
2127 	if (err)
2128 		goto err_create;
2129 
2130 	rwq->rsc.type = MLX5_RSC_TYPE_RWQ;
2131 	rwq->rsc.rsn =  cmd.drv.user_index;
2132 
2133 	rwq->wq.post_recv = mlx5_post_wq_recv;
2134 	return &rwq->wq;
2135 
2136 err_create:
2137 	mlx5_clear_uidx(ctx, cmd.drv.user_index);
2138 err_free_db_rec:
2139 	mlx5_free_db(to_mctx(context), rwq->db);
2140 err_spl:
2141 	mlx5_spinlock_destroy(&rwq->rq.lock);
2142 err_free_rwq_buf:
2143 	mlx5_free_rwq_buf(rwq, context);
2144 err_cleanup_wq:
2145 	ibv_cleanup_wq(&rwq->wq);
2146 err:
2147 	free(rwq);
2148 	return NULL;
2149 }
2150 
2151 int mlx5_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr)
2152 {
2153 	struct mlx5_modify_wq	cmd = {};
2154 	struct mlx5_rwq *rwq = to_mrwq(wq);
2155 
2156 	if ((attr->attr_mask & IBV_WQ_ATTR_STATE) &&
2157 	    attr->wq_state == IBV_WQS_RDY) {
2158 		if ((attr->attr_mask & IBV_WQ_ATTR_CURR_STATE) &&
2159 		    attr->curr_wq_state != wq->state)
2160 			return -EINVAL;
2161 
2162 		if (wq->state == IBV_WQS_RESET) {
2163 			mlx5_spin_lock(&to_mcq(wq->cq)->lock);
2164 			__mlx5_cq_clean(to_mcq(wq->cq),
2165 					rwq->rsc.rsn, NULL);
2166 			mlx5_spin_unlock(&to_mcq(wq->cq)->lock);
2167 			mlx5_init_rwq_indices(rwq);
2168 			rwq->db[MLX5_RCV_DBR] = 0;
2169 			rwq->db[MLX5_SND_DBR] = 0;
2170 		}
2171 	}
2172 
2173 	return ibv_cmd_modify_wq(wq, attr, &cmd.ibv_cmd,  sizeof(cmd.ibv_cmd), sizeof(cmd));
2174 }
2175 
2176 int mlx5_destroy_wq(struct ibv_wq *wq)
2177 {
2178 	struct mlx5_rwq *rwq = to_mrwq(wq);
2179 	int ret;
2180 
2181 	ret = ibv_cmd_destroy_wq(wq);
2182 	if (ret)
2183 		return ret;
2184 
2185 	mlx5_spin_lock(&to_mcq(wq->cq)->lock);
2186 	__mlx5_cq_clean(to_mcq(wq->cq), rwq->rsc.rsn, NULL);
2187 	mlx5_spin_unlock(&to_mcq(wq->cq)->lock);
2188 	mlx5_clear_uidx(to_mctx(wq->context), rwq->rsc.rsn);
2189 	mlx5_free_db(to_mctx(wq->context), rwq->db);
2190 	mlx5_spinlock_destroy(&rwq->rq.lock);
2191 	mlx5_free_rwq_buf(rwq, wq->context);
2192 	ibv_cleanup_wq(&rwq->wq);
2193 	free(rwq);
2194 
2195 	return 0;
2196 }
2197 
2198 struct ibv_rwq_ind_table *mlx5_create_rwq_ind_table(struct ibv_context *context,
2199 						    struct ibv_rwq_ind_table_init_attr *init_attr)
2200 {
2201 	struct ibv_create_rwq_ind_table *cmd;
2202 	struct mlx5_create_rwq_ind_table_resp resp;
2203 	struct ibv_rwq_ind_table *ind_table;
2204 	uint32_t required_tbl_size;
2205 	int num_tbl_entries;
2206 	int cmd_size;
2207 	int err;
2208 
2209 	num_tbl_entries = 1 << init_attr->log_ind_tbl_size;
2210 	/* Data must be u64 aligned */
2211 	required_tbl_size = (num_tbl_entries * sizeof(uint32_t)) < sizeof(uint64_t) ?
2212 			sizeof(uint64_t) : (num_tbl_entries * sizeof(uint32_t));
2213 
2214 	cmd_size = required_tbl_size + sizeof(*cmd);
2215 	cmd = calloc(1, cmd_size);
2216 	if (!cmd)
2217 		return NULL;
2218 
2219 	memset(&resp, 0, sizeof(resp));
2220 	ind_table = calloc(1, sizeof(*ind_table));
2221 	if (!ind_table)
2222 		goto free_cmd;
2223 
2224 	err = ibv_cmd_create_rwq_ind_table(context, init_attr, ind_table, cmd,
2225 					   cmd_size, cmd_size, &resp.ibv_resp, sizeof(resp.ibv_resp),
2226 					   sizeof(resp));
2227 	if (err)
2228 		goto err;
2229 
2230 	free(cmd);
2231 	return ind_table;
2232 
2233 err:
2234 	free(ind_table);
2235 free_cmd:
2236 	free(cmd);
2237 	return NULL;
2238 }
2239 
/*
 * Destroy a receive-WQ indirection table.  The table object is freed only
 * when the destroy command succeeds; otherwise the command's error is
 * returned and the table is kept.
 */
int mlx5_destroy_rwq_ind_table(struct ibv_rwq_ind_table *rwq_ind_table)
{
	int rc = ibv_cmd_destroy_rwq_ind_table(rwq_ind_table);

	if (rc == 0)
		free(rwq_ind_table);

	return rc;
}
2252