xref: /freebsd/contrib/ofed/libirdma/irdma_uverbs.c (revision 924226fba12cc9a228c73b956e1b7fa24c60b055)
1 /*-
2  * SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
3  *
4  * Copyright (C) 2019 - 2021 Intel Corporation
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenFabrics.org BSD license below:
11  *
12  *   Redistribution and use in source and binary forms, with or
13  *   without modification, are permitted provided that the following
14  *   conditions are met:
15  *
16  *    - Redistributions of source code must retain the above
17  *	copyright notice, this list of conditions and the following
18  *	disclaimer.
19  *
20  *    - Redistributions in binary form must reproduce the above
21  *	copyright notice, this list of conditions and the following
22  *	disclaimer in the documentation and/or other materials
23  *	provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  */
34 /*$FreeBSD$*/
35 
36 #include <config.h>
37 #include <stdlib.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <signal.h>
42 #include <errno.h>
43 #include <sys/param.h>
44 #include <sys/mman.h>
45 #include <netinet/in.h>
46 #include <sys/stat.h>
47 #include <fcntl.h>
48 #include <stdbool.h>
49 
50 #include "irdma_umain.h"
51 #include "abi.h"
52 
53 static inline void
54 print_fw_ver(uint64_t fw_ver, char *str, size_t len)
55 {
56 	uint16_t major, minor;
57 
58 	major = fw_ver >> 32 & 0xffff;
59 	minor = fw_ver & 0xffff;
60 
61 	snprintf(str, len, "%d.%d", major, minor);
62 }
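
/*
 * Illustrative note: the firmware version reported by the driver packs the
 * major number in bits 47:32 and the minor number in bits 15:0, so a
 * hypothetical fw_ver of ((uint64_t)2 << 32) | 5 is formatted by
 * print_fw_ver() as the string "2.5".
 */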
63 
64 /**
65  * irdma_uquery_device_ex - query device attributes including extended properties
66  * @context: user context for the device
67  * @input: extensible input struct for ibv_query_device_ex verb
68  * @attr: extended device attribute struct
69  * @attr_size: size of extended device attribute struct
70  **/
71 int
72 irdma_uquery_device_ex(struct ibv_context *context,
73 		       const struct ibv_query_device_ex_input *input,
74 		       struct ibv_device_attr_ex *attr, size_t attr_size)
75 {
76 	struct irdma_query_device_ex cmd = {};
77 	struct irdma_query_device_ex_resp resp = {};
78 	uint64_t fw_ver;
79 	int ret;
80 
81 	ret = ibv_cmd_query_device_ex(context, input, attr, attr_size, &fw_ver,
82 				      &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd),
83 				      &resp.ibv_resp, sizeof(resp.ibv_resp), sizeof(resp));
84 	if (ret)
85 		return ret;
86 
87 	print_fw_ver(fw_ver, attr->orig_attr.fw_ver, sizeof(attr->orig_attr.fw_ver));
88 
89 	return 0;
90 }
91 
92 /**
93  * irdma_uquery_device - call driver to query device for max resources
94  * @context: user context for the device
95  * @attr: where to save all the max resources from the driver
96  **/
97 int
98 irdma_uquery_device(struct ibv_context *context, struct ibv_device_attr *attr)
99 {
100 	struct ibv_query_device cmd;
101 	uint64_t fw_ver;
102 	int ret;
103 
104 	ret = ibv_cmd_query_device(context, attr, &fw_ver, &cmd, sizeof(cmd));
105 	if (ret)
106 		return ret;
107 
108 	print_fw_ver(fw_ver, attr->fw_ver, sizeof(attr->fw_ver));
109 
110 	return 0;
111 }
112 
113 /**
114  * irdma_uquery_port - get port attributes (msg size, link, mtu...)
115  * @context: user context of the device
116  * @port: port for the attributes
117  * @attr: to return port attributes
118  **/
119 int
120 irdma_uquery_port(struct ibv_context *context, uint8_t port,
121 		  struct ibv_port_attr *attr)
122 {
123 	struct ibv_query_port cmd;
124 
125 	return ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
126 }
127 
128 /**
129  * irdma_ualloc_pd - allocate protection domain and return pd ptr
130  * @context: user context of the device
131  **/
132 struct ibv_pd *
133 irdma_ualloc_pd(struct ibv_context *context)
134 {
135 	struct ibv_alloc_pd cmd;
136 	struct irdma_ualloc_pd_resp resp = {};
137 	struct irdma_upd *iwupd;
138 	int err;
139 
140 	iwupd = malloc(sizeof(*iwupd));
141 	if (!iwupd)
142 		return NULL;
143 
144 	err = ibv_cmd_alloc_pd(context, &iwupd->ibv_pd, &cmd, sizeof(cmd),
145 			       &resp.ibv_resp, sizeof(resp));
146 	if (err)
147 		goto err_free;
148 
149 	iwupd->pd_id = resp.pd_id;
150 
151 	return &iwupd->ibv_pd;
152 
153 err_free:
154 	free(iwupd);
155 	errno = err;
156 	return NULL;
157 }
158 
159 /**
160  * irdma_ufree_pd - free pd resources
161  * @pd: pd whose resources are freed
162  */
163 int
164 irdma_ufree_pd(struct ibv_pd *pd)
165 {
166 	struct irdma_upd *iwupd;
167 	int ret;
168 
169 	iwupd = container_of(pd, struct irdma_upd, ibv_pd);
170 	ret = ibv_cmd_dealloc_pd(pd);
171 	if (ret)
172 		return ret;
173 
174 	free(iwupd);
175 
176 	return 0;
177 }
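
/*
 * Illustrative usage sketch (not part of the provider): applications do not
 * call irdma_ualloc_pd()/irdma_ufree_pd() directly; they use the generic verbs
 * API, which dispatches to these callbacks. "ctx" is assumed to be an open
 * ibv_context.
 *
 *	struct ibv_pd *pd = ibv_alloc_pd(ctx);
 *
 *	if (!pd)
 *		perror("ibv_alloc_pd");
 *	...
 *	ibv_dealloc_pd(pd);
 */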
178 
179 /**
180  * irdma_ureg_mr - register user memory region
181  * @pd: pd for the mr
182  * @addr: user address of the memory region
183  * @length: length of the memory
185  * @access: access allowed on this mr
186  */
187 struct ibv_mr *
188 irdma_ureg_mr(struct ibv_pd *pd, void *addr, size_t length,
189 	      int access)
190 {
191 	struct irdma_umr *umr;
192 	struct irdma_ureg_mr cmd;
193 	struct ibv_reg_mr_resp resp;
194 	int err;
195 
196 	umr = malloc(sizeof(*umr));
197 	if (!umr)
198 		return NULL;
199 
200 	cmd.reg_type = IRDMA_MEMREG_TYPE_MEM;
201 	err = ibv_cmd_reg_mr(pd, addr, length,
202 			     (uintptr_t)addr, access, &umr->vmr.ibv_mr, &cmd.ibv_cmd,
203 			     sizeof(cmd), &resp, sizeof(resp));
204 	if (err) {
205 		free(umr);
206 		errno = err;
207 		return NULL;
208 	}
209 	umr->acc_flags = access;
210 
211 	return &umr->vmr.ibv_mr;
212 }
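
/*
 * Illustrative usage sketch (not part of the provider): irdma_ureg_mr() is
 * reached through the standard ibv_reg_mr() entry point. The buffer size and
 * access flags below are arbitrary examples.
 *
 *	void *buf = malloc(4096);
 *	struct ibv_mr *mr = ibv_reg_mr(pd, buf, 4096,
 *				       IBV_ACCESS_LOCAL_WRITE |
 *				       IBV_ACCESS_REMOTE_WRITE);
 *
 *	if (!mr)
 *		perror("ibv_reg_mr");
 *	...
 *	ibv_dereg_mr(mr);
 *	free(buf);
 */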
213 
214 /**
215  * irdma_udereg_mr - deregister memory region
216  * @mr: mr that was allocated
217  */
218 int
219 irdma_udereg_mr(struct ibv_mr *mr)
220 {
221 	struct irdma_umr *umr;
222 	struct verbs_mr *vmr;
223 	int ret;
224 
225 	vmr = container_of(mr, struct verbs_mr, ibv_mr);
226 	umr = container_of(vmr, struct irdma_umr, vmr);
227 
228 	ret = ibv_cmd_dereg_mr(mr);
229 	if (ret)
230 		return ret;
231 
232 	free(umr);
233 
234 	return 0;
235 }
236 
237 /**
238  * irdma_ualloc_mw - allocate memory window
239  * @pd: protection domain
240  * @type: memory window type
241  */
242 struct ibv_mw *
243 irdma_ualloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
244 {
245 	struct ibv_mw *mw;
246 	struct ibv_alloc_mw cmd;
247 	struct ibv_alloc_mw_resp resp;
248 
249 	mw = calloc(1, sizeof(*mw));
250 	if (!mw)
251 		return NULL;
252 
253 	if (ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), &resp,
254 			     sizeof(resp))) {
255 		printf("%s: Failed to alloc memory window\n",
256 		       __func__);
257 		free(mw);
258 		return NULL;
259 	}
260 
261 	return mw;
262 }
263 
264 /**
265  * irdma_ubind_mw - bind a memory window
266  * @qp: qp to post WR
267  * @mw: memory window to bind
268  * @mw_bind: bind info
269  */
270 int
271 irdma_ubind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
272 	       struct ibv_mw_bind *mw_bind)
273 {
274 	struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info;
275 	struct verbs_mr *vmr;
276 	struct irdma_umr *umr;
277 
278 	struct ibv_send_wr wr = {};
279 	struct ibv_send_wr *bad_wr;
280 	int err;
281 
282 	if (!bind_info->mr && (bind_info->addr || bind_info->length))
283 		return EINVAL;
284 
285 	if (bind_info->mr) {
286 		vmr = verbs_get_mr(bind_info->mr);
287 		umr = container_of(vmr, struct irdma_umr, vmr);
288 		if (vmr->mr_type != IBV_MR_TYPE_MR)
289 			return ENOTSUP;
290 
291 		if (umr->acc_flags & IBV_ACCESS_ZERO_BASED)
292 			return EINVAL;
293 
294 		if (mw->pd != bind_info->mr->pd)
295 			return EPERM;
296 	}
297 
298 	wr.opcode = IBV_WR_BIND_MW;
299 	wr.bind_mw.bind_info = mw_bind->bind_info;
300 	wr.bind_mw.mw = mw;
301 	wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey);
302 
303 	wr.wr_id = mw_bind->wr_id;
304 	wr.send_flags = mw_bind->send_flags;
305 
306 	err = irdma_upost_send(qp, &wr, &bad_wr);
307 	if (!err)
308 		mw->rkey = wr.bind_mw.rkey;
309 
310 	return err;
311 }
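
/*
 * Illustrative usage sketch (not part of the provider): a type 1 memory window
 * bind goes through ibv_bind_mw(), which calls irdma_ubind_mw() above. "mr",
 * "buf", "mw" and "qp" are assumed to come from earlier registration/creation
 * calls.
 *
 *	struct ibv_mw_bind bind = {
 *		.wr_id = 1,
 *		.send_flags = IBV_SEND_SIGNALED,
 *		.bind_info = {
 *			.mr = mr,
 *			.addr = (uintptr_t)buf,
 *			.length = 4096,
 *			.mw_access_flags = IBV_ACCESS_REMOTE_READ,
 *		},
 *	};
 *
 *	if (ibv_bind_mw(qp, mw, &bind))
 *		fprintf(stderr, "MW bind failed\n");
 */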
312 
313 /**
314  * irdma_udealloc_mw - deallocate memory window
315  * @mw: memory window to dealloc
316  */
317 int
318 irdma_udealloc_mw(struct ibv_mw *mw)
319 {
320 	int ret;
321 	struct ibv_dealloc_mw cmd;
322 
323 	ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd));
324 	if (ret)
325 		return ret;
326 	free(mw);
327 
328 	return 0;
329 }
330 
331 static void *
332 irdma_alloc_hw_buf(size_t size)
333 {
334 	void *buf;
335 
336 	buf = memalign(IRDMA_HW_PAGE_SIZE, size);
337 
338 	if (!buf)
339 		return NULL;
340 	if (ibv_dontfork_range(buf, size)) {
341 		free(buf);
342 		return NULL;
343 	}
344 
345 	return buf;
346 }
347 
348 static void
349 irdma_free_hw_buf(void *buf, size_t size)
350 {
351 	ibv_dofork_range(buf, size);
352 	free(buf);
353 }
354 
355 /**
356  * get_cq_size - returns the actual number of cqes needed by HW
357  * @ncqe: minimum cqes requested by application
358  * @hw_rev: HW generation
359  */
360 static inline int
361 get_cq_size(int ncqe, u8 hw_rev)
362 {
363 	ncqe++;
364 
365 	/* Completions with immediate require 1 extra entry */
366 	if (hw_rev > IRDMA_GEN_1)
367 		ncqe *= 2;
368 
369 	if (ncqe < IRDMA_U_MINCQ_SIZE)
370 		ncqe = IRDMA_U_MINCQ_SIZE;
371 
372 	return ncqe;
373 }
374 
375 static inline size_t get_cq_total_bytes(u32 cq_size) {
376 	return roundup(cq_size * sizeof(struct irdma_cqe), IRDMA_HW_PAGE_SIZE);
377 }
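
/*
 * Illustrative sizing example (values are arbitrary, assumes GEN_2 or newer
 * hardware): an application asking for 64 CQEs gets
 * get_cq_size(64, hw_rev) = (64 + 1) * 2 = 130 entries (or IRDMA_U_MINCQ_SIZE
 * if that is larger), and get_cq_total_bytes() rounds
 * 130 * sizeof(struct irdma_cqe) up to a whole number of HW pages.
 */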
378 
379 /**
380  * ucreate_cq - irdma util function to create a CQ
381  * @context: ibv context
382  * @attr_ex: CQ init attributes
383  * @ext_cq: flag to create an extended or normal CQ
384  */
385 static struct ibv_cq_ex *
386 ucreate_cq(struct ibv_context *context,
387 	   struct ibv_cq_init_attr_ex *attr_ex,
388 	   bool ext_cq)
389 {
390 	struct irdma_cq_uk_init_info info = {};
391 	struct irdma_ureg_mr reg_mr_cmd = {};
392 	struct irdma_ucreate_cq_ex cmd = {};
393 	struct irdma_ucreate_cq_ex_resp resp = {};
394 	struct ibv_reg_mr_resp reg_mr_resp = {};
395 	struct irdma_ureg_mr reg_mr_shadow_cmd = {};
396 	struct ibv_reg_mr_resp reg_mr_shadow_resp = {};
397 	struct irdma_uk_attrs *uk_attrs;
398 	struct irdma_uvcontext *iwvctx;
399 	struct irdma_ucq *iwucq;
400 	size_t total_size;
401 	u32 cq_pages;
402 	int ret, ncqe;
403 	u8 hw_rev;
404 
405 	iwvctx = container_of(context, struct irdma_uvcontext, ibv_ctx);
406 	uk_attrs = &iwvctx->uk_attrs;
407 	hw_rev = uk_attrs->hw_rev;
408 
409 	if (ext_cq && hw_rev == IRDMA_GEN_1) {
410 		errno = EOPNOTSUPP;
411 		return NULL;
412 	}
413 
414 	if (attr_ex->cqe < IRDMA_MIN_CQ_SIZE || attr_ex->cqe > uk_attrs->max_hw_cq_size) {
415 		errno = EINVAL;
416 		return NULL;
417 	}
418 
419 	/* save the cqe count requested by the application */
420 	ncqe = attr_ex->cqe;
421 
422 	iwucq = calloc(1, sizeof(*iwucq));
423 	if (!iwucq)
424 		return NULL;
425 
426 	if (pthread_spin_init(&iwucq->lock, PTHREAD_PROCESS_PRIVATE)) {
427 		free(iwucq);
428 		return NULL;
429 	}
430 
431 	info.cq_size = get_cq_size(attr_ex->cqe, hw_rev);
432 	iwucq->comp_vector = attr_ex->comp_vector;
433 	LIST_INIT(&iwucq->resize_list);
434 	LIST_INIT(&iwucq->cmpl_generated);
435 	total_size = get_cq_total_bytes(info.cq_size);
436 	cq_pages = total_size >> IRDMA_HW_PAGE_SHIFT;
437 
438 	if (!(uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE))
439 		total_size = (cq_pages << IRDMA_HW_PAGE_SHIFT) + IRDMA_DB_SHADOW_AREA_SIZE;
440 
441 	iwucq->buf_size = total_size;
442 	info.cq_base = irdma_alloc_hw_buf(total_size);
443 	if (!info.cq_base)
444 		goto err_cq_base;
445 
446 	memset(info.cq_base, 0, total_size);
447 	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
448 	reg_mr_cmd.cq_pages = cq_pages;
449 
450 	ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.cq_base,
451 			     total_size, (uintptr_t)info.cq_base,
452 			     IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr.ibv_mr,
453 			     &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd),
454 			     &reg_mr_resp, sizeof(reg_mr_resp));
455 	if (ret) {
456 		errno = ret;
457 		goto err_dereg_mr;
458 	}
459 
460 	iwucq->vmr.ibv_mr.pd = &iwvctx->iwupd->ibv_pd;
461 
462 	if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) {
463 		info.shadow_area = irdma_alloc_hw_buf(IRDMA_DB_SHADOW_AREA_SIZE);
464 		if (!info.shadow_area)
465 			goto err_dereg_mr;
466 
467 		memset(info.shadow_area, 0, IRDMA_DB_SHADOW_AREA_SIZE);
468 		reg_mr_shadow_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
469 		reg_mr_shadow_cmd.cq_pages = 1;
470 
471 		ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.shadow_area,
472 				     IRDMA_DB_SHADOW_AREA_SIZE, (uintptr_t)info.shadow_area,
473 				     IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr_shadow_area.ibv_mr,
474 				     &reg_mr_shadow_cmd.ibv_cmd, sizeof(reg_mr_shadow_cmd),
475 				     &reg_mr_shadow_resp, sizeof(reg_mr_shadow_resp));
476 		if (ret) {
477 			errno = ret;
478 			goto err_dereg_shadow;
479 		}
480 
481 		iwucq->vmr_shadow_area.ibv_mr.pd = &iwvctx->iwupd->ibv_pd;
482 
483 	} else {
484 		info.shadow_area = (__le64 *) ((u8 *)info.cq_base + (cq_pages << IRDMA_HW_PAGE_SHIFT));
485 	}
486 
487 	attr_ex->cqe = info.cq_size;
488 	cmd.user_cq_buf = (__u64) ((uintptr_t)info.cq_base);
489 	cmd.user_shadow_area = (__u64) ((uintptr_t)info.shadow_area);
490 
491 	ret = ibv_cmd_create_cq_ex(context, attr_ex, &iwucq->verbs_cq.cq_ex,
492 				   &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd), &resp.ibv_resp,
493 				   sizeof(resp.ibv_resp), sizeof(resp));
494 	if (ret) {
495 		errno = ret;
496 		goto err_dereg_shadow;
497 	}
498 
499 	if (ext_cq)
500 		irdma_ibvcq_ex_fill_priv_funcs(iwucq, attr_ex);
501 	info.cq_id = resp.cq_id;
502 	/* Do not report the cqes consumed by HW */
503 	iwucq->verbs_cq.cq.cqe = ncqe;
504 
505 	info.cqe_alloc_db = (u32 *)((u8 *)iwvctx->db + IRDMA_DB_CQ_OFFSET);
506 	irdma_uk_cq_init(&iwucq->cq, &info);
507 
508 	return &iwucq->verbs_cq.cq_ex;
509 
510 err_dereg_shadow:
511 	ibv_cmd_dereg_mr(&iwucq->vmr.ibv_mr);
512 	if (iwucq->vmr_shadow_area.ibv_mr.handle) {
513 		ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area.ibv_mr);
514 		irdma_free_hw_buf(info.shadow_area, IRDMA_HW_PAGE_SIZE);
515 	}
516 err_dereg_mr:
517 	irdma_free_hw_buf(info.cq_base, total_size);
518 err_cq_base:
519 	printf("%s: failed to initialize CQ\n", __func__);
520 	pthread_spin_destroy(&iwucq->lock);
521 
522 	free(iwucq);
523 
524 	return NULL;
525 }
526 
527 struct ibv_cq *
528 irdma_ucreate_cq(struct ibv_context *context, int cqe,
529 		 struct ibv_comp_channel *channel,
530 		 int comp_vector)
531 {
532 	struct ibv_cq_init_attr_ex attr_ex = {
533 		.cqe = cqe,
534 		.channel = channel,
535 		.comp_vector = comp_vector,
536 	};
537 	struct ibv_cq_ex *ibvcq_ex;
538 
539 	ibvcq_ex = ucreate_cq(context, &attr_ex, false);
540 
541 	return ibvcq_ex ? ibv_cq_ex_to_cq(ibvcq_ex) : NULL;
542 }
543 
544 struct ibv_cq_ex *
545 irdma_ucreate_cq_ex(struct ibv_context *context,
546 		    struct ibv_cq_init_attr_ex *attr_ex)
547 {
548 	if (attr_ex->wc_flags & ~IRDMA_CQ_SUPPORTED_WC_FLAGS) {
549 		errno = EOPNOTSUPP;
550 		return NULL;
551 	}
552 
553 	return ucreate_cq(context, attr_ex, true);
554 }
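
/*
 * Illustrative usage sketch (not part of the provider): creating an extended
 * CQ that goes through irdma_ucreate_cq_ex() above. The size and wc_flags are
 * arbitrary examples; wc_flags must stay within IRDMA_CQ_SUPPORTED_WC_FLAGS
 * and "ctx" is assumed to be an open ibv_context.
 *
 *	struct ibv_cq_init_attr_ex cq_attr = {
 *		.cqe = 256,
 *		.comp_vector = 0,
 *		.wc_flags = IBV_WC_EX_WITH_BYTE_LEN | IBV_WC_EX_WITH_QP_NUM,
 *	};
 *	struct ibv_cq_ex *cq_ex = ibv_create_cq_ex(ctx, &cq_attr);
 *
 *	if (!cq_ex)
 *		perror("ibv_create_cq_ex");
 */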
555 
556 /**
557  * irdma_free_cq_buf - free memory for cq buffer
558  * @cq_buf: cq buf to free
559  */
560 static void
561 irdma_free_cq_buf(struct irdma_cq_buf *cq_buf)
562 {
563 	ibv_cmd_dereg_mr(&cq_buf->vmr.ibv_mr);
564 	irdma_free_hw_buf(cq_buf->cq.cq_base, get_cq_total_bytes(cq_buf->cq.cq_size));
565 	free(cq_buf);
566 }
567 
568 /**
569  * irdma_process_resize_list - process the cq list to remove buffers
570  * @iwucq: cq which owns the list
571  * @lcqe_buf: cq buf where the last cqe is found
572  */
573 static int
574 irdma_process_resize_list(struct irdma_ucq *iwucq,
575 			  struct irdma_cq_buf *lcqe_buf)
576 {
577 	struct irdma_cq_buf *cq_buf, *next;
578 	int cq_cnt = 0;
579 
580 	LIST_FOREACH_SAFE(cq_buf, &iwucq->resize_list, list, next) {
581 		if (cq_buf == lcqe_buf)
582 			return cq_cnt;
583 
584 		LIST_REMOVE(cq_buf, list);
585 		irdma_free_cq_buf(cq_buf);
586 		cq_cnt++;
587 	}
588 
589 	return cq_cnt;
590 }
591 
592 static void
593 irdma_remove_cmpls_list(struct irdma_ucq *iwucq)
594 {
595 	struct irdma_cmpl_gen *cmpl_node, *next;
596 
597 	LIST_FOREACH_SAFE(cmpl_node, &iwucq->cmpl_generated, list, next) {
598 		LIST_REMOVE(cmpl_node, list);
599 		free(cmpl_node);
600 	}
601 }
602 
603 static int
604 irdma_generated_cmpls(struct irdma_ucq *iwucq, struct irdma_cq_poll_info *cq_poll_info)
605 {
606 	struct irdma_cmpl_gen *cmpl;
607 
608 	if (!iwucq || LIST_EMPTY(&iwucq->cmpl_generated))
609 		return ENOENT;
610 	cmpl = LIST_FIRST(&iwucq->cmpl_generated);
611 	LIST_REMOVE(cmpl, list);
612 	memcpy(cq_poll_info, &cmpl->cpi, sizeof(*cq_poll_info));
613 
614 	free(cmpl);
615 
616 	return 0;
617 }
618 
619 /**
620  * irdma_set_cpi_common_values - fill in values for polling info struct
621  * @cpi: resulting structure of cq_poll_info type
622  * @qp: user QP handle
623  * @qp_num: id of the QP
624  */
625 static void
626 irdma_set_cpi_common_values(struct irdma_cq_poll_info *cpi,
627 			    struct irdma_qp_uk *qp, __u32 qp_num)
628 {
629 	cpi->comp_status = IRDMA_COMPL_STATUS_FLUSHED;
630 	cpi->error = 1;
631 	cpi->major_err = IRDMA_FLUSH_MAJOR_ERR;
632 	cpi->minor_err = FLUSH_GENERAL_ERR;
633 	cpi->qp_handle = (irdma_qp_handle) (uintptr_t)qp;
634 	cpi->qp_id = qp_num;
635 }
636 
637 static bool
638 irdma_cq_empty(struct irdma_ucq *iwucq)
639 {
640 	struct irdma_cq_uk *ukcq;
641 	__u64 qword3;
642 	__le64 *cqe;
643 	__u8 polarity;
644 
645 	ukcq = &iwucq->cq;
646 	cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq);
647 	get_64bit_val(cqe, 24, &qword3);
648 	polarity = (__u8) RS_64(qword3, IRDMA_CQ_VALID);
649 
650 	return polarity != ukcq->polarity;
651 }
652 
653 /**
654  * irdma_generate_flush_completions - generate completion from WRs
655  * @iwuqp: pointer to QP
656  */
657 static void
658 irdma_generate_flush_completions(struct irdma_uqp *iwuqp)
659 {
660 	struct irdma_qp_uk *qp = &iwuqp->qp;
661 	struct irdma_ring *sq_ring = &qp->sq_ring;
662 	struct irdma_ring *rq_ring = &qp->rq_ring;
663 	struct irdma_cmpl_gen *cmpl;
664 	__le64 *sw_wqe;
665 	__u64 wqe_qword;
666 	__u32 wqe_idx;
667 
668 	if (pthread_spin_lock(&iwuqp->send_cq->lock))
669 		return;
670 	if (irdma_cq_empty(iwuqp->send_cq)) {
671 		while (IRDMA_RING_MORE_WORK(*sq_ring)) {
672 			cmpl = malloc(sizeof(*cmpl));
673 			if (!cmpl) {
674 				pthread_spin_unlock(&iwuqp->send_cq->lock);
675 				return;
676 			}
677 
678 			wqe_idx = sq_ring->tail;
679 			irdma_set_cpi_common_values(&cmpl->cpi, qp, qp->qp_id);
680 			cmpl->cpi.wr_id = qp->sq_wrtrk_array[wqe_idx].wrid;
681 			sw_wqe = qp->sq_base[wqe_idx].elem;
682 			get_64bit_val(sw_wqe, 24, &wqe_qword);
683 			cmpl->cpi.op_type = (__u8) RS_64(wqe_qword, IRDMAQPSQ_OPCODE);
684 			/* remove the SQ WR by moving SQ tail */
685 			IRDMA_RING_SET_TAIL(*sq_ring, sq_ring->tail + qp->sq_wrtrk_array[sq_ring->tail].quanta);
686 			LIST_INSERT_HEAD(&iwuqp->send_cq->cmpl_generated, cmpl, list);
687 		}
688 	}
689 	pthread_spin_unlock(&iwuqp->send_cq->lock);
690 	if (pthread_spin_lock(&iwuqp->recv_cq->lock))
691 		return;
692 	if (irdma_cq_empty(iwuqp->recv_cq)) {
693 		while (IRDMA_RING_MORE_WORK(*rq_ring)) {
694 			cmpl = malloc(sizeof(*cmpl));
695 			if (!cmpl) {
696 				pthread_spin_unlock(&iwuqp->recv_cq->lock);
697 				return;
698 			}
699 
700 			wqe_idx = rq_ring->tail;
701 			irdma_set_cpi_common_values(&cmpl->cpi, qp, qp->qp_id);
702 			cmpl->cpi.wr_id = qp->rq_wrid_array[wqe_idx];
703 			cmpl->cpi.op_type = IRDMA_OP_TYPE_REC;
704 			/* remove the RQ WR by moving RQ tail */
705 			IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1);
706 			LIST_INSERT_HEAD(&iwuqp->recv_cq->cmpl_generated, cmpl, list);
707 		}
708 	}
709 	pthread_spin_unlock(&iwuqp->recv_cq->lock);
710 }
711 
712 void *
713 irdma_flush_thread(void *arg)
714 {
715 	__u8 i = 5;
716 	struct irdma_uqp *iwuqp = arg;
717 
718 	while (--i) {
719 		if (pthread_spin_lock(&iwuqp->lock))
720 			break;
721 		irdma_generate_flush_completions(arg);
722 		pthread_spin_unlock(&iwuqp->lock);
723 		sleep(1);
724 	}
725 	pthread_exit(NULL);
726 }
727 
728 /**
729  * irdma_udestroy_cq - destroys cq
730  * @cq: ptr to cq to be destroyed
731  */
732 int
733 irdma_udestroy_cq(struct ibv_cq *cq)
734 {
735 	struct irdma_uk_attrs *uk_attrs;
736 	struct irdma_uvcontext *iwvctx;
737 	struct irdma_ucq *iwucq;
738 	int ret;
739 
740 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
741 	iwvctx = container_of(cq->context, struct irdma_uvcontext, ibv_ctx);
742 	uk_attrs = &iwvctx->uk_attrs;
743 
744 	ret = pthread_spin_destroy(&iwucq->lock);
745 	if (ret)
746 		goto err;
747 
748 	if (!LIST_EMPTY(&iwucq->cmpl_generated))
749 		irdma_remove_cmpls_list(iwucq);
750 	irdma_process_resize_list(iwucq, NULL);
751 	ret = ibv_cmd_destroy_cq(cq);
752 	if (ret)
753 		goto err;
754 
755 	ibv_cmd_dereg_mr(&iwucq->vmr.ibv_mr);
756 	irdma_free_hw_buf(iwucq->cq.cq_base, iwucq->buf_size);
757 
758 	if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) {
759 		ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area.ibv_mr);
760 		irdma_free_hw_buf(iwucq->cq.shadow_area, IRDMA_DB_SHADOW_AREA_SIZE);
761 	}
762 	free(iwucq);
763 	return 0;
764 
765 err:
766 	return ret;
767 }
768 
769 static enum ibv_wc_status
770 irdma_flush_err_to_ib_wc_status(enum irdma_flush_opcode opcode)
771 {
772 	switch (opcode) {
773 	case FLUSH_PROT_ERR:
774 		return IBV_WC_LOC_PROT_ERR;
775 	case FLUSH_REM_ACCESS_ERR:
776 		return IBV_WC_REM_ACCESS_ERR;
777 	case FLUSH_LOC_QP_OP_ERR:
778 		return IBV_WC_LOC_QP_OP_ERR;
779 	case FLUSH_REM_OP_ERR:
780 		return IBV_WC_REM_OP_ERR;
781 	case FLUSH_LOC_LEN_ERR:
782 		return IBV_WC_LOC_LEN_ERR;
783 	case FLUSH_GENERAL_ERR:
784 		return IBV_WC_WR_FLUSH_ERR;
785 	case FLUSH_MW_BIND_ERR:
786 		return IBV_WC_MW_BIND_ERR;
787 	case FLUSH_REM_INV_REQ_ERR:
788 		return IBV_WC_REM_INV_REQ_ERR;
789 	case FLUSH_RETRY_EXC_ERR:
790 		return IBV_WC_RETRY_EXC_ERR;
791 	case FLUSH_FATAL_ERR:
792 	default:
793 		return IBV_WC_FATAL_ERR;
794 	}
795 }
796 
797 /**
798  * irdma_process_cqe_ext - process current cqe for extended CQ
799  * @cur_cqe: current cqe info
800  */
801 static void
802 irdma_process_cqe_ext(struct irdma_cq_poll_info *cur_cqe)
803 {
804 	struct irdma_ucq *iwucq = container_of(cur_cqe, struct irdma_ucq, cur_cqe);
805 	struct ibv_cq_ex *ibvcq_ex = &iwucq->verbs_cq.cq_ex;
806 
807 	ibvcq_ex->wr_id = cur_cqe->wr_id;
808 	if (cur_cqe->error)
809 		ibvcq_ex->status = (cur_cqe->comp_status == IRDMA_COMPL_STATUS_FLUSHED) ?
810 		    irdma_flush_err_to_ib_wc_status(cur_cqe->minor_err) : IBV_WC_GENERAL_ERR;
811 	else
812 		ibvcq_ex->status = IBV_WC_SUCCESS;
813 }
814 
815 /**
816  * irdma_process_cqe - process current cqe info
817  * @entry: ibv_wc object to fill in for non-extended CQ
818  * @cur_cqe: current cqe info
819  */
820 static void
821 irdma_process_cqe(struct ibv_wc *entry, struct irdma_cq_poll_info *cur_cqe)
822 {
823 	struct irdma_qp_uk *qp;
824 	struct ibv_qp *ib_qp;
825 
826 	entry->wc_flags = 0;
827 	entry->wr_id = cur_cqe->wr_id;
828 	entry->qp_num = cur_cqe->qp_id;
829 	qp = cur_cqe->qp_handle;
830 	ib_qp = qp->back_qp;
831 
832 	if (cur_cqe->error) {
833 		entry->status = (cur_cqe->comp_status == IRDMA_COMPL_STATUS_FLUSHED) ?
834 		    irdma_flush_err_to_ib_wc_status(cur_cqe->minor_err) :
835 		    IBV_WC_GENERAL_ERR;
836 		entry->vendor_err = cur_cqe->major_err << 16 |
837 		    cur_cqe->minor_err;
838 	} else {
839 		entry->status = IBV_WC_SUCCESS;
840 	}
841 
842 	if (cur_cqe->imm_valid) {
843 		entry->imm_data = htonl(cur_cqe->imm_data);
844 		entry->wc_flags |= IBV_WC_WITH_IMM;
845 	}
846 
847 	switch (cur_cqe->op_type) {
848 	case IRDMA_OP_TYPE_RDMA_WRITE:
849 	case IRDMA_OP_TYPE_RDMA_WRITE_SOL:
850 		entry->opcode = IBV_WC_RDMA_WRITE;
851 		break;
852 	case IRDMA_OP_TYPE_RDMA_READ:
853 		entry->opcode = IBV_WC_RDMA_READ;
854 		break;
855 	case IRDMA_OP_TYPE_SEND_SOL:
856 	case IRDMA_OP_TYPE_SEND_SOL_INV:
857 	case IRDMA_OP_TYPE_SEND_INV:
858 	case IRDMA_OP_TYPE_SEND:
859 		entry->opcode = IBV_WC_SEND;
860 		break;
861 	case IRDMA_OP_TYPE_BIND_MW:
862 		entry->opcode = IBV_WC_BIND_MW;
863 		break;
864 	case IRDMA_OP_TYPE_REC:
865 		entry->opcode = IBV_WC_RECV;
866 		if (ib_qp->qp_type != IBV_QPT_UD &&
867 		    cur_cqe->stag_invalid_set) {
868 			entry->invalidated_rkey = cur_cqe->inv_stag;
869 			entry->wc_flags |= IBV_WC_WITH_INV;
870 		}
871 		break;
872 	case IRDMA_OP_TYPE_REC_IMM:
873 		entry->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
874 		if (ib_qp->qp_type != IBV_QPT_UD &&
875 		    cur_cqe->stag_invalid_set) {
876 			entry->invalidated_rkey = cur_cqe->inv_stag;
877 			entry->wc_flags |= IBV_WC_WITH_INV;
878 		}
879 		break;
880 	case IRDMA_OP_TYPE_INV_STAG:
881 		entry->opcode = IBV_WC_LOCAL_INV;
882 		break;
883 	default:
884 		entry->status = IBV_WC_GENERAL_ERR;
885 		printf("%s: Invalid opcode = %d in CQE\n",
886 		       __func__, cur_cqe->op_type);
887 		return;
888 	}
889 
890 	if (ib_qp->qp_type == IBV_QPT_UD) {
891 		entry->src_qp = cur_cqe->ud_src_qpn;
892 		entry->wc_flags |= IBV_WC_GRH;
893 	} else {
894 		entry->src_qp = cur_cqe->qp_id;
895 	}
896 	entry->byte_len = cur_cqe->bytes_xfered;
897 }
898 
899 /**
900  * irdma_poll_one - poll one entry of the CQ
901  * @ukcq: ukcq to poll
902  * @cur_cqe: current CQE info to be filled in
903  * @entry: ibv_wc object to be filled for non-extended CQ or NULL for extended CQ
904  *
905  * Returns the internal irdma device error code or 0 on success
906  */
907 static int
908 irdma_poll_one(struct irdma_cq_uk *ukcq, struct irdma_cq_poll_info *cur_cqe,
909 	       struct ibv_wc *entry)
910 {
911 	int ret = irdma_uk_cq_poll_cmpl(ukcq, cur_cqe);
912 
913 	if (ret)
914 		return ret;
915 
916 	if (!entry)
917 		irdma_process_cqe_ext(cur_cqe);
918 	else
919 		irdma_process_cqe(entry, cur_cqe);
920 
921 	return 0;
922 }
923 
924 /**
925  * __irdma_upoll_cq - irdma util function to poll device CQ
926  * @iwucq: irdma cq to poll
927  * @num_entries: max cq entries to poll
928  * @entry: pointer to array of ibv_wc objects to be filled in for each completion or NULL if ext CQ
929  *
930  * Returns a non-negative value equal to the number of completions
931  * found, or EINVAL on failure.
932  */
933 static int
934 __irdma_upoll_cq(struct irdma_ucq *iwucq, int num_entries,
935 		 struct ibv_wc *entry)
936 {
937 	struct irdma_cq_buf *cq_buf, *next;
938 	struct irdma_cq_buf *last_buf = NULL;
939 	struct irdma_cq_poll_info *cur_cqe = &iwucq->cur_cqe;
940 	bool cq_new_cqe = false;
941 	int resized_bufs = 0;
942 	int npolled = 0;
943 	int ret;
944 
945 	/* go through the list of previously resized CQ buffers */
946 	LIST_FOREACH_SAFE(cq_buf, &iwucq->resize_list, list, next) {
947 		while (npolled < num_entries) {
948 			ret = irdma_poll_one(&cq_buf->cq, cur_cqe,
949 					     entry ? entry + npolled : NULL);
950 			if (!ret) {
951 				++npolled;
952 				cq_new_cqe = true;
953 				continue;
954 			}
955 			if (ret == ENOENT)
956 				break;
957 			/* QP using the CQ is destroyed. Skip reporting this CQE */
958 			if (ret == EFAULT) {
959 				cq_new_cqe = true;
960 				continue;
961 			}
962 			goto error;
963 		}
964 
965 		/* save the resized CQ buffer which received the last cqe */
966 		if (cq_new_cqe)
967 			last_buf = cq_buf;
968 		cq_new_cqe = false;
969 	}
970 
971 	/* check the current CQ for new cqes */
972 	while (npolled < num_entries) {
973 		ret = irdma_poll_one(&iwucq->cq, cur_cqe,
974 				     entry ? entry + npolled : NULL);
975 		if (ret == ENOENT) {
976 			ret = irdma_generated_cmpls(iwucq, cur_cqe);
977 			if (!ret) {
978 				if (entry)
979 					irdma_process_cqe(entry + npolled, cur_cqe);
980 				else
981 					irdma_process_cqe_ext(cur_cqe);
982 			}
983 		}
984 		if (!ret) {
985 			++npolled;
986 			cq_new_cqe = true;
987 			continue;
988 		}
989 		if (ret == ENOENT)
990 			break;
991 		/* QP using the CQ is destroyed. Skip reporting this CQE */
992 		if (ret == EFAULT) {
993 			cq_new_cqe = true;
994 			continue;
995 		}
996 		goto error;
997 	}
998 
999 	if (cq_new_cqe)
1000 		/* all previous CQ resizes are complete */
1001 		resized_bufs = irdma_process_resize_list(iwucq, NULL);
1002 	else if (last_buf)
1003 		/* only CQ resizes up to the last_buf are complete */
1004 		resized_bufs = irdma_process_resize_list(iwucq, last_buf);
1005 	if (resized_bufs)
1006 		/* report to the HW the number of complete CQ resizes */
1007 		irdma_uk_cq_set_resized_cnt(&iwucq->cq, resized_bufs);
1008 
1009 	return npolled;
1010 
1011 error:
1012 	printf("%s: Error polling CQ, irdma_err: %d\n", __func__, ret);
1013 
1014 	return EINVAL;
1015 }
1016 
1017 /**
1018  * irdma_upoll_cq - verb API callback to poll device CQ
1019  * @cq: ibv_cq to poll
1020  * @num_entries: max cq entries to poll
1021  * @entry: pointer to array of ibv_wc objects to be filled in for each completion
1022  *
1023  * Returns a non-negative value equal to the number of completions
1024  * found, or a negative error code on failure.
1025  */
1026 int
1027 irdma_upoll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *entry)
1028 {
1029 	struct irdma_ucq *iwucq;
1030 	int ret;
1031 
1032 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1033 	ret = pthread_spin_lock(&iwucq->lock);
1034 	if (ret)
1035 		return -ret;
1036 
1037 	ret = __irdma_upoll_cq(iwucq, num_entries, entry);
1038 
1039 	pthread_spin_unlock(&iwucq->lock);
1040 
1041 	return ret;
1042 }
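
/*
 * Illustrative usage sketch (not part of the provider): callers normally reach
 * irdma_upoll_cq() through the generic ibv_poll_cq() wrapper. The batch size
 * below is an arbitrary example.
 *
 *	struct ibv_wc wc[16];
 *	int i, n;
 *
 *	n = ibv_poll_cq(cq, 16, wc);
 *	for (i = 0; i < n; i++) {
 *		if (wc[i].status != IBV_WC_SUCCESS)
 *			fprintf(stderr, "wr_id %llu failed: %s\n",
 *				(unsigned long long)wc[i].wr_id,
 *				ibv_wc_status_str(wc[i].status));
 *	}
 */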
1043 
1044 /**
1045  * irdma_start_poll - verb_ex API callback to poll batch of WC's
1046  * @ibvcq_ex: ibv extended CQ
1047  * @attr: attributes (not used)
1048  *
1049  * Start polling a batch of work completions. Returns 0 on success, ENOENT when
1050  * no completions are available on the CQ, or another error code on failure.
1051  */
1052 static int
1053 irdma_start_poll(struct ibv_cq_ex *ibvcq_ex, struct ibv_poll_cq_attr *attr)
1054 {
1055 	struct irdma_ucq *iwucq;
1056 	int ret;
1057 
1058 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1059 	ret = pthread_spin_lock(&iwucq->lock);
1060 	if (ret)
1061 		return ret;
1062 
1063 	ret = __irdma_upoll_cq(iwucq, 1, NULL);
1064 	if (ret == 1)
1065 		return 0;
1066 
1067 	/* No Completions on CQ */
1068 	if (!ret)
1069 		ret = ENOENT;
1070 
1071 	pthread_spin_unlock(&iwucq->lock);
1072 
1073 	return ret;
1074 }
1075 
1076 /**
1077  * irdma_next_poll - verb_ex API callback to get next WC
1078  * @ibvcq_ex: ibv extended CQ
1079  *
1080  * Returns 0 on success, ENOENT when no completions are available on the CQ,
1081  * or another error code on failure.
1082  */
1083 static int
1084 irdma_next_poll(struct ibv_cq_ex *ibvcq_ex)
1085 {
1086 	struct irdma_ucq *iwucq;
1087 	int ret;
1088 
1089 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1090 	ret = __irdma_upoll_cq(iwucq, 1, NULL);
1091 	if (ret == 1)
1092 		return 0;
1093 
1094 	/* No Completions on CQ */
1095 	if (!ret)
1096 		ret = ENOENT;
1097 
1098 	return ret;
1099 }
1100 
1101 /**
1102  * irdma_end_poll - verb_ex API callback to end polling of WC's
1103  * @ibvcq_ex: ibv extended CQ
1104  */
1105 static void
1106 irdma_end_poll(struct ibv_cq_ex *ibvcq_ex)
1107 {
1108 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1109 					       verbs_cq.cq_ex);
1110 
1111 	pthread_spin_unlock(&iwucq->lock);
1112 }
1113 
1114 /**
1115  * irdma_wc_read_completion_ts - Get completion timestamp
1116  * @ibvcq_ex: ibv extended CQ
1117  *
1118  * Get completion timestamp in HCA clock units
1119  */
1120 static uint64_t irdma_wc_read_completion_ts(struct ibv_cq_ex *ibvcq_ex){
1121 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1122 					       verbs_cq.cq_ex);
1123 #define HCA_CORE_CLOCK_800_MHZ 800
1124 
1125 	return iwucq->cur_cqe.tcp_seq_num_rtt / HCA_CORE_CLOCK_800_MHZ;
1126 }
1127 
1128 static enum ibv_wc_opcode
1129 irdma_wc_read_opcode(struct ibv_cq_ex *ibvcq_ex)
1130 {
1131 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1132 					       verbs_cq.cq_ex);
1133 
1134 	switch (iwucq->cur_cqe.op_type) {
1135 	case IRDMA_OP_TYPE_RDMA_WRITE:
1136 	case IRDMA_OP_TYPE_RDMA_WRITE_SOL:
1137 		return IBV_WC_RDMA_WRITE;
1138 	case IRDMA_OP_TYPE_RDMA_READ:
1139 		return IBV_WC_RDMA_READ;
1140 	case IRDMA_OP_TYPE_SEND_SOL:
1141 	case IRDMA_OP_TYPE_SEND_SOL_INV:
1142 	case IRDMA_OP_TYPE_SEND_INV:
1143 	case IRDMA_OP_TYPE_SEND:
1144 		return IBV_WC_SEND;
1145 	case IRDMA_OP_TYPE_BIND_MW:
1146 		return IBV_WC_BIND_MW;
1147 	case IRDMA_OP_TYPE_REC:
1148 		return IBV_WC_RECV;
1149 	case IRDMA_OP_TYPE_REC_IMM:
1150 		return IBV_WC_RECV_RDMA_WITH_IMM;
1151 	case IRDMA_OP_TYPE_INV_STAG:
1152 		return IBV_WC_LOCAL_INV;
1153 	}
1154 
1155 	printf("%s: Invalid opcode = %d in CQE\n", __func__,
1156 	       iwucq->cur_cqe.op_type);
1157 
1158 	return 0;
1159 }
1160 
1161 static uint32_t irdma_wc_read_vendor_err(struct ibv_cq_ex *ibvcq_ex){
1162 	struct irdma_cq_poll_info *cur_cqe;
1163 	struct irdma_ucq *iwucq;
1164 
1165 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1166 	cur_cqe = &iwucq->cur_cqe;
1167 
1168 	return cur_cqe->error ? cur_cqe->major_err << 16 | cur_cqe->minor_err : 0;
1169 }
1170 
1171 static int
1172 irdma_wc_read_wc_flags(struct ibv_cq_ex *ibvcq_ex)
1173 {
1174 	struct irdma_cq_poll_info *cur_cqe;
1175 	struct irdma_ucq *iwucq;
1176 	struct irdma_qp_uk *qp;
1177 	struct ibv_qp *ib_qp;
1178 	int wc_flags = 0;
1179 
1180 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1181 	cur_cqe = &iwucq->cur_cqe;
1182 	qp = cur_cqe->qp_handle;
1183 	ib_qp = qp->back_qp;
1184 
1185 	if (cur_cqe->imm_valid)
1186 		wc_flags |= IBV_WC_WITH_IMM;
1187 
1188 	if (ib_qp->qp_type == IBV_QPT_UD) {
1189 		wc_flags |= IBV_WC_GRH;
1190 	} else {
1191 		if (cur_cqe->stag_invalid_set) {
1192 			switch (cur_cqe->op_type) {
1193 			case IRDMA_OP_TYPE_REC:
1194 				wc_flags |= IBV_WC_WITH_INV;
1195 				break;
1196 			case IRDMA_OP_TYPE_REC_IMM:
1197 				wc_flags |= IBV_WC_WITH_INV;
1198 				break;
1199 			}
1200 		}
1201 	}
1202 
1203 	return wc_flags;
1204 }
1205 
1206 static uint32_t irdma_wc_read_byte_len(struct ibv_cq_ex *ibvcq_ex){
1207 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1208 					       verbs_cq.cq_ex);
1209 
1210 	return iwucq->cur_cqe.bytes_xfered;
1211 }
1212 
1213 static __be32 irdma_wc_read_imm_data(struct ibv_cq_ex *ibvcq_ex){
1214 	struct irdma_cq_poll_info *cur_cqe;
1215 	struct irdma_ucq *iwucq;
1216 
1217 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1218 	cur_cqe = &iwucq->cur_cqe;
1219 
1220 	return cur_cqe->imm_valid ? htonl(cur_cqe->imm_data) : 0;
1221 }
1222 
1223 static uint32_t irdma_wc_read_qp_num(struct ibv_cq_ex *ibvcq_ex){
1224 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1225 					       verbs_cq.cq_ex);
1226 
1227 	return iwucq->cur_cqe.qp_id;
1228 }
1229 
1230 static uint32_t irdma_wc_read_src_qp(struct ibv_cq_ex *ibvcq_ex){
1231 	struct irdma_cq_poll_info *cur_cqe;
1232 	struct irdma_ucq *iwucq;
1233 	struct irdma_qp_uk *qp;
1234 	struct ibv_qp *ib_qp;
1235 
1236 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1237 	cur_cqe = &iwucq->cur_cqe;
1238 	qp = cur_cqe->qp_handle;
1239 	ib_qp = qp->back_qp;
1240 
1241 	return ib_qp->qp_type == IBV_QPT_UD ? cur_cqe->ud_src_qpn : cur_cqe->qp_id;
1242 }
1243 
1244 static uint8_t irdma_wc_read_sl(struct ibv_cq_ex *ibvcq_ex){
1245 	return 0;
1246 }
1247 
1248 void
1249 irdma_ibvcq_ex_fill_priv_funcs(struct irdma_ucq *iwucq,
1250 			       struct ibv_cq_init_attr_ex *attr_ex)
1251 {
1252 	struct ibv_cq_ex *ibvcq_ex = &iwucq->verbs_cq.cq_ex;
1253 
1254 	ibvcq_ex->start_poll = irdma_start_poll;
1255 	ibvcq_ex->end_poll = irdma_end_poll;
1256 	ibvcq_ex->next_poll = irdma_next_poll;
1257 
1258 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP) {
1259 		ibvcq_ex->read_completion_ts = irdma_wc_read_completion_ts;
1260 		iwucq->report_rtt = true;
1261 	}
1262 
1263 	ibvcq_ex->read_opcode = irdma_wc_read_opcode;
1264 	ibvcq_ex->read_vendor_err = irdma_wc_read_vendor_err;
1265 	ibvcq_ex->read_wc_flags = irdma_wc_read_wc_flags;
1266 
1267 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
1268 		ibvcq_ex->read_byte_len = irdma_wc_read_byte_len;
1269 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_IMM)
1270 		ibvcq_ex->read_imm_data = irdma_wc_read_imm_data;
1271 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_QP_NUM)
1272 		ibvcq_ex->read_qp_num = irdma_wc_read_qp_num;
1273 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_SRC_QP)
1274 		ibvcq_ex->read_src_qp = irdma_wc_read_src_qp;
1275 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_SL)
1276 		ibvcq_ex->read_sl = irdma_wc_read_sl;
1277 }
1278 
1279 /**
1280  * irdma_arm_cq - arm of cq
1281  * @iwucq: cq to which arm
1282  * @cq_notify: notification params
1283  */
1284 static void
1285 irdma_arm_cq(struct irdma_ucq *iwucq,
1286 	     enum irdma_cmpl_notify cq_notify)
1287 {
1288 	iwucq->is_armed = true;
1289 	iwucq->arm_sol = true;
1290 	iwucq->skip_arm = false;
1291 	iwucq->skip_sol = true;
1292 	irdma_uk_cq_request_notification(&iwucq->cq, cq_notify);
1293 }
1294 
1295 /**
1296  * irdma_uarm_cq - callback to arm the cq
1297  * @cq: cq to arm
1298  * @solicited: when non-zero, request notification for solicited completions only
1299  */
1300 int
1301 irdma_uarm_cq(struct ibv_cq *cq, int solicited)
1302 {
1303 	struct irdma_ucq *iwucq;
1304 	enum irdma_cmpl_notify cq_notify = IRDMA_CQ_COMPL_EVENT;
1305 	int ret;
1306 
1307 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1308 	if (solicited)
1309 		cq_notify = IRDMA_CQ_COMPL_SOLICITED;
1310 
1311 	ret = pthread_spin_lock(&iwucq->lock);
1312 	if (ret)
1313 		return ret;
1314 
1315 	if (iwucq->is_armed) {
1316 		if (iwucq->arm_sol && !solicited) {
1317 			irdma_arm_cq(iwucq, cq_notify);
1318 		} else {
1319 			iwucq->skip_arm = true;
1320 			iwucq->skip_sol = solicited ? true : false;
1321 		}
1322 	} else {
1323 		irdma_arm_cq(iwucq, cq_notify);
1324 	}
1325 
1326 	pthread_spin_unlock(&iwucq->lock);
1327 
1328 	return 0;
1329 }
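
/*
 * Illustrative usage sketch (not part of the provider): the arm path above is
 * reached through ibv_req_notify_cq(). A typical event loop re-arms the CQ
 * after draining it; "channel" is assumed to be the completion channel the CQ
 * was created with and process_wc() is a hypothetical handler.
 *
 *	struct ibv_cq *ev_cq;
 *	void *ev_ctx;
 *	struct ibv_wc wc;
 *
 *	ibv_req_notify_cq(cq, 0);
 *	ibv_get_cq_event(channel, &ev_cq, &ev_ctx);
 *	ibv_ack_cq_events(ev_cq, 1);
 *	ibv_req_notify_cq(ev_cq, 0);
 *	while (ibv_poll_cq(ev_cq, 1, &wc) > 0)
 *		process_wc(&wc);
 */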
1330 
1331 /**
1332  * irdma_cq_event - handle a completion event on the cq
1333  * @cq: cq that received the event
1334  */
1335 void
1336 irdma_cq_event(struct ibv_cq *cq)
1337 {
1338 	struct irdma_ucq *iwucq;
1339 
1340 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1341 	if (pthread_spin_lock(&iwucq->lock))
1342 		return;
1343 
1344 	if (iwucq->skip_arm)
1345 		irdma_arm_cq(iwucq, IRDMA_CQ_COMPL_EVENT);
1346 	else
1347 		iwucq->is_armed = false;
1348 
1349 	pthread_spin_unlock(&iwucq->lock);
1350 }
1351 
1352 void *
1353 irdma_mmap(int fd, off_t offset)
1354 {
1355 	void *map;
1356 
1357 	map = mmap(NULL, IRDMA_HW_PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED,
1358 		   fd, offset);
1359 	if (map == MAP_FAILED)
1360 		return map;
1361 
1362 	if (ibv_dontfork_range(map, IRDMA_HW_PAGE_SIZE)) {
1363 		munmap(map, IRDMA_HW_PAGE_SIZE);
1364 		return MAP_FAILED;
1365 	}
1366 
1367 	return map;
1368 }
1369 
1370 void
1371 irdma_munmap(void *map)
1372 {
1373 	ibv_dofork_range(map, IRDMA_HW_PAGE_SIZE);
1374 	munmap(map, IRDMA_HW_PAGE_SIZE);
1375 }
1376 
1377 /**
1378  * irdma_destroy_vmapped_qp - destroy resources for qp
1379  * @iwuqp: qp struct for resources
1380  */
1381 static int
1382 irdma_destroy_vmapped_qp(struct irdma_uqp *iwuqp)
1383 {
1384 	int ret;
1385 
1386 	ret = ibv_cmd_destroy_qp(&iwuqp->ibv_qp);
1387 	if (ret)
1388 		return ret;
1389 
1390 	if (iwuqp->qp.push_db)
1391 		irdma_munmap(iwuqp->qp.push_db);
1392 	if (iwuqp->qp.push_wqe)
1393 		irdma_munmap(iwuqp->qp.push_wqe);
1394 
1395 	ibv_cmd_dereg_mr(&iwuqp->vmr.ibv_mr);
1396 
1397 	return 0;
1398 }
1399 
1400 /**
1401  * irdma_vmapped_qp - create resources for qp
1402  * @iwuqp: qp struct for resources
1403  * @pd: pd for the qp
1404  * @attr: attributes of qp passed
1406  * @sqdepth: depth of sq
1407  * @rqdepth: depth of rq
1408  * @info: info for initializing user level qp
1409  * @legacy_mode: true if the provider is running in legacy mode
1410  */
1411 static int
1412 irdma_vmapped_qp(struct irdma_uqp *iwuqp, struct ibv_pd *pd,
1413 		 struct ibv_qp_init_attr *attr, int sqdepth,
1414 		 int rqdepth, struct irdma_qp_uk_init_info *info,
1415 		 bool legacy_mode)
1416 {
1417 	struct irdma_ucreate_qp cmd = {};
1418 	size_t sqsize, rqsize, totalqpsize;
1419 	struct irdma_ucreate_qp_resp resp = {};
1420 	struct irdma_ureg_mr reg_mr_cmd = {};
1421 	struct ibv_reg_mr_resp reg_mr_resp = {};
1422 	int ret;
1423 
1424 	sqsize = roundup(sqdepth * IRDMA_QP_WQE_MIN_SIZE, IRDMA_HW_PAGE_SIZE);
1425 	rqsize = roundup(rqdepth * IRDMA_QP_WQE_MIN_SIZE, IRDMA_HW_PAGE_SIZE);
1426 	totalqpsize = rqsize + sqsize + IRDMA_DB_SHADOW_AREA_SIZE;
1427 	info->sq = irdma_alloc_hw_buf(totalqpsize);
1428 	iwuqp->buf_size = totalqpsize;
1429 
1430 	if (!info->sq)
1431 		return ENOMEM;
1432 
1433 	memset(info->sq, 0, totalqpsize);
1434 	info->rq = &info->sq[sqsize / IRDMA_QP_WQE_MIN_SIZE];
1435 	info->shadow_area = info->rq[rqsize / IRDMA_QP_WQE_MIN_SIZE].elem;
1436 
1437 	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_QP;
1438 	reg_mr_cmd.sq_pages = sqsize >> IRDMA_HW_PAGE_SHIFT;
1439 	reg_mr_cmd.rq_pages = rqsize >> IRDMA_HW_PAGE_SHIFT;
1440 
1441 	ret = ibv_cmd_reg_mr(pd, info->sq, totalqpsize,
1442 			     (uintptr_t)info->sq, IBV_ACCESS_LOCAL_WRITE,
1443 			     &iwuqp->vmr.ibv_mr, &reg_mr_cmd.ibv_cmd,
1444 			     sizeof(reg_mr_cmd), &reg_mr_resp,
1445 			     sizeof(reg_mr_resp));
1446 	if (ret)
1447 		goto err_dereg_mr;
1448 
1449 	cmd.user_wqe_bufs = (__u64) ((uintptr_t)info->sq);
1450 	cmd.user_compl_ctx = (__u64) (uintptr_t)&iwuqp->qp;
1451 	ret = ibv_cmd_create_qp(pd, &iwuqp->ibv_qp, attr, &cmd.ibv_cmd,
1452 				sizeof(cmd), &resp.ibv_resp,
1453 				sizeof(struct irdma_ucreate_qp_resp));
1454 	if (ret)
1455 		goto err_qp;
1456 
1457 	info->sq_size = resp.actual_sq_size;
1458 	info->rq_size = resp.actual_rq_size;
1459 	info->first_sq_wq = legacy_mode ? 1 : resp.lsmm;
1460 	info->qp_caps = resp.qp_caps;
1461 	info->qp_id = resp.qp_id;
1462 	iwuqp->irdma_drv_opt = resp.irdma_drv_opt;
1463 	iwuqp->ibv_qp.qp_num = resp.qp_id;
1464 
1465 	iwuqp->send_cq = container_of(attr->send_cq, struct irdma_ucq,
1466 				      verbs_cq.cq);
1467 	iwuqp->recv_cq = container_of(attr->recv_cq, struct irdma_ucq,
1468 				      verbs_cq.cq);
1469 	iwuqp->send_cq->uqp = iwuqp;
1470 	iwuqp->recv_cq->uqp = iwuqp;
1471 
1472 	return 0;
1473 err_qp:
1474 	ibv_cmd_dereg_mr(&iwuqp->vmr.ibv_mr);
1475 err_dereg_mr:
1476 	printf("%s: failed to create QP, status %d\n", __func__, ret);
1477 	irdma_free_hw_buf(info->sq, iwuqp->buf_size);
1478 	return ret;
1479 }
1480 
1481 /**
1482  * irdma_ucreate_qp - create qp for the user application
1483  * @pd: pd for the qp
1484  * @attr: attributes of the qp to be created (sizes, sge, cq)
1485  */
1486 struct ibv_qp *
1487 irdma_ucreate_qp(struct ibv_pd *pd,
1488 		 struct ibv_qp_init_attr *attr)
1489 {
1490 	struct irdma_qp_uk_init_info info = {};
1491 	struct irdma_uk_attrs *uk_attrs;
1492 	struct irdma_uvcontext *iwvctx;
1493 	struct irdma_uqp *iwuqp;
1494 	u32 sqdepth, rqdepth;
1495 	u8 sqshift, rqshift;
1496 	int status;
1497 
1498 	if (attr->qp_type != IBV_QPT_RC && attr->qp_type != IBV_QPT_UD) {
1499 		printf("%s: failed to create QP, unsupported QP type: 0x%x\n",
1500 		       __func__, attr->qp_type);
1501 		errno = EOPNOTSUPP;
1502 		return NULL;
1503 	}
1504 
1505 	iwvctx = container_of(pd->context, struct irdma_uvcontext, ibv_ctx);
1506 	uk_attrs = &iwvctx->uk_attrs;
1507 
1508 	if (attr->cap.max_send_sge > uk_attrs->max_hw_wq_frags ||
1509 	    attr->cap.max_recv_sge > uk_attrs->max_hw_wq_frags ||
1510 	    attr->cap.max_inline_data > uk_attrs->max_hw_inline) {
1511 		errno = EINVAL;
1512 		return NULL;
1513 	}
1514 
1515 	irdma_get_wqe_shift(uk_attrs,
1516 			    uk_attrs->hw_rev > IRDMA_GEN_1 ? attr->cap.max_send_sge + 1 :
1517 			    attr->cap.max_send_sge,
1518 			    attr->cap.max_inline_data, &sqshift);
1519 	status = irdma_get_sqdepth(uk_attrs->max_hw_wq_quanta,
1520 				   attr->cap.max_send_wr, sqshift, &sqdepth);
1521 	if (status) {
1522 		printf("%s: invalid SQ attributes, max_send_wr=%d max_send_sge=%d max_inline=%d\n",
1523 		       __func__, attr->cap.max_send_wr, attr->cap.max_send_sge,
1524 		       attr->cap.max_inline_data);
1525 		errno = status;
1526 		return NULL;
1527 	}
1528 
1529 	if (uk_attrs->hw_rev == IRDMA_GEN_1 && iwvctx->abi_ver > 4)
1530 		rqshift = IRDMA_MAX_RQ_WQE_SHIFT_GEN1;
1531 	else
1532 		irdma_get_wqe_shift(uk_attrs, attr->cap.max_recv_sge, 0,
1533 				    &rqshift);
1534 
1535 	status = irdma_get_rqdepth(uk_attrs->max_hw_rq_quanta,
1536 				   attr->cap.max_recv_wr, rqshift, &rqdepth);
1537 	if (status) {
1538 		printf("%s: invalid RQ attributes, recv_wr=%d recv_sge=%d\n",
1539 		       __func__, attr->cap.max_recv_wr, attr->cap.max_recv_sge);
1540 		errno = status;
1541 		return NULL;
1542 	}
1543 
1544 	iwuqp = memalign(1024, sizeof(*iwuqp));
1545 	if (!iwuqp)
1546 		return NULL;
1547 
1548 	memset(iwuqp, 0, sizeof(*iwuqp));
1549 
1550 	if (pthread_spin_init(&iwuqp->lock, PTHREAD_PROCESS_PRIVATE))
1551 		goto err_free_qp;
1552 
1553 	info.sq_size = sqdepth >> sqshift;
1554 	info.rq_size = rqdepth >> rqshift;
1555 	attr->cap.max_send_wr = info.sq_size;
1556 	attr->cap.max_recv_wr = info.rq_size;
1557 
1558 	info.uk_attrs = uk_attrs;
1559 	info.max_sq_frag_cnt = attr->cap.max_send_sge;
1560 	info.max_rq_frag_cnt = attr->cap.max_recv_sge;
1561 	iwuqp->recv_sges = calloc(attr->cap.max_recv_sge, sizeof(*iwuqp->recv_sges));
1562 	if (!iwuqp->recv_sges)
1563 		goto err_destroy_lock;
1564 
1565 	info.wqe_alloc_db = (u32 *)iwvctx->db;
1566 	info.legacy_mode = iwvctx->legacy_mode;
1567 	info.sq_wrtrk_array = calloc(sqdepth, sizeof(*info.sq_wrtrk_array));
1568 	if (!info.sq_wrtrk_array)
1569 		goto err_free_rsges;
1570 
1571 	info.rq_wrid_array = calloc(rqdepth, sizeof(*info.rq_wrid_array));
1572 	if (!info.rq_wrid_array)
1573 		goto err_free_sq_wrtrk;
1574 
1575 	iwuqp->sq_sig_all = attr->sq_sig_all;
1576 	iwuqp->qp_type = attr->qp_type;
1577 	status = irdma_vmapped_qp(iwuqp, pd, attr, sqdepth, rqdepth, &info, iwvctx->legacy_mode);
1578 	if (status) {
1579 		errno = status;
1580 		goto err_free_rq_wrid;
1581 	}
1582 
1583 	iwuqp->qp.back_qp = iwuqp;
1584 	iwuqp->qp.lock = &iwuqp->lock;
1585 
1586 	info.max_sq_frag_cnt = attr->cap.max_send_sge;
1587 	info.max_rq_frag_cnt = attr->cap.max_recv_sge;
1588 	info.max_inline_data = attr->cap.max_inline_data;
1589 	iwuqp->qp.force_fence = true;
1590 	status = irdma_uk_qp_init(&iwuqp->qp, &info);
1591 	if (status) {
1592 		errno = status;
1593 		goto err_free_vmap_qp;
1594 	}
1595 
1596 	attr->cap.max_send_wr = (sqdepth - IRDMA_SQ_RSVD) >> sqshift;
1597 	attr->cap.max_recv_wr = (rqdepth - IRDMA_RQ_RSVD) >> rqshift;
1598 	return &iwuqp->ibv_qp;
1599 
1600 err_free_vmap_qp:
1601 	irdma_destroy_vmapped_qp(iwuqp);
1602 	irdma_free_hw_buf(info.sq, iwuqp->buf_size);
1603 err_free_rq_wrid:
1604 	free(info.rq_wrid_array);
1605 err_free_sq_wrtrk:
1606 	free(info.sq_wrtrk_array);
1607 err_free_rsges:
1608 	free(iwuqp->recv_sges);
1609 err_destroy_lock:
1610 	pthread_spin_destroy(&iwuqp->lock);
1611 err_free_qp:
1612 	printf("%s: failed to create QP\n", __func__);
1613 	free(iwuqp);
1614 
1615 	return NULL;
1616 }
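
/*
 * Illustrative usage sketch (not part of the provider): creating an RC QP that
 * is serviced by irdma_ucreate_qp() above. The capacities are arbitrary
 * examples and may be adjusted by the provider as described in the code; "cq"
 * and "pd" are assumed to already exist.
 *
 *	struct ibv_qp_init_attr qp_attr = {
 *		.send_cq = cq,
 *		.recv_cq = cq,
 *		.cap = {
 *			.max_send_wr = 64,
 *			.max_recv_wr = 64,
 *			.max_send_sge = 2,
 *			.max_recv_sge = 2,
 *		},
 *		.qp_type = IBV_QPT_RC,
 *	};
 *	struct ibv_qp *qp = ibv_create_qp(pd, &qp_attr);
 *
 *	if (!qp)
 *		perror("ibv_create_qp");
 */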
1617 
1618 /**
1619  * irdma_uquery_qp - query qp for some attribute
1620  * @qp: qp for the attributes query
1621  * @attr: to return the attributes
1622  * @attr_mask: mask of which attributes to query
1623  * @init_attr: initial attributes during create_qp
1624  */
1625 int
1626 irdma_uquery_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask,
1627 		struct ibv_qp_init_attr *init_attr)
1628 {
1629 	struct ibv_query_qp cmd;
1630 
1631 	return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd,
1632 				sizeof(cmd));
1633 }
1634 
1635 /**
1636  * irdma_umodify_qp - send qp modify to driver
1637  * @qp: qp to modify
1638  * @attr: attribute to modify
1639  * @attr_mask: mask of the attribute
1640  */
1641 int
1642 irdma_umodify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask)
1643 {
1644 	struct irdma_umodify_qp_resp resp = {};
1645 	struct ibv_modify_qp cmd = {};
1646 	struct irdma_modify_qp_cmd cmd_ex = {};
1647 	struct irdma_uvcontext *iwvctx;
1648 	struct irdma_uqp *iwuqp;
1649 
1650 	iwuqp = container_of(qp, struct irdma_uqp, ibv_qp);
1651 	iwvctx = container_of(qp->context, struct irdma_uvcontext, ibv_ctx);
1652 	iwuqp->attr_mask = attr_mask;
1653 	memcpy(&iwuqp->attr, attr, sizeof(iwuqp->attr));
1654 
1655 	if (iwuqp->qp.qp_caps & IRDMA_PUSH_MODE && attr_mask & IBV_QP_STATE &&
1656 	    iwvctx->uk_attrs.hw_rev > IRDMA_GEN_1) {
1657 		u64 offset;
1658 		void *map;
1659 		int ret;
1660 
1661 		ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex.ibv_cmd,
1662 					   sizeof(cmd_ex.ibv_cmd),
1663 					   sizeof(cmd_ex), &resp.ibv_resp,
1664 					   sizeof(resp.ibv_resp),
1665 					   sizeof(resp));
1666 		if (!ret)
1667 			iwuqp->qp.rd_fence_rate = resp.rd_fence_rate;
1668 		if (ret || !resp.push_valid)
1669 			return ret;
1670 
1671 		if (iwuqp->qp.push_wqe)
1672 			return ret;
1673 
1674 		offset = resp.push_wqe_mmap_key;
1675 		map = irdma_mmap(qp->context->cmd_fd, offset);
1676 		if (map == MAP_FAILED)
1677 			return ret;
1678 
1679 		iwuqp->qp.push_wqe = map;
1680 
1681 		offset = resp.push_db_mmap_key;
1682 		map = irdma_mmap(qp->context->cmd_fd, offset);
1683 		if (map == MAP_FAILED) {
1684 			irdma_munmap(iwuqp->qp.push_wqe);
1685 			iwuqp->qp.push_wqe = NULL;
1686 			printf("failed to map push page, errno %d\n", errno);
1687 			return ret;
1688 		}
1689 		iwuqp->qp.push_wqe += resp.push_offset;
1690 		iwuqp->qp.push_db = map + resp.push_offset;
1691 
1692 		return ret;
1693 	} else {
1694 		int ret;
1695 
1696 		ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd));
1697 		if (ret)
1698 			return ret;
1699 		if (attr_mask & IBV_QP_STATE && attr->qp_state == IBV_QPS_ERR)
1700 			pthread_create(&iwuqp->flush_thread, NULL, irdma_flush_thread, iwuqp);
1701 		return 0;
1702 	}
1703 }
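
/*
 * Illustrative usage sketch (not part of the provider): moving a QP to the
 * error state through the generic API lands in irdma_umodify_qp() above and,
 * on this path, also starts the flush thread.
 *
 *	struct ibv_qp_attr attr = { .qp_state = IBV_QPS_ERR };
 *
 *	if (ibv_modify_qp(qp, &attr, IBV_QP_STATE))
 *		fprintf(stderr, "modify_qp to ERR failed\n");
 */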
1704 
1705 static void
1706 irdma_issue_flush(struct ibv_qp *qp, bool sq_flush, bool rq_flush)
1707 {
1708 	struct irdma_umodify_qp_resp resp = {};
1709 	struct irdma_modify_qp_cmd cmd_ex = {};
1710 	struct irdma_uqp *iwuqp;
1711 
1712 	cmd_ex.sq_flush = sq_flush;
1713 	cmd_ex.rq_flush = rq_flush;
1714 	iwuqp = container_of(qp, struct irdma_uqp, ibv_qp);
1715 
1716 	ibv_cmd_modify_qp_ex(qp, &iwuqp->attr, iwuqp->attr_mask,
1717 			     &cmd_ex.ibv_cmd,
1718 			     sizeof(cmd_ex.ibv_cmd),
1719 			     sizeof(cmd_ex), &resp.ibv_resp,
1720 			     sizeof(resp.ibv_resp),
1721 			     sizeof(resp));
1722 }
1723 
1724 /**
1725  * irdma_clean_cqes - clean cq entries for qp
1726  * @qp: qp for which completions are cleaned
1727  * @iwucq: cq to be cleaned
1728  */
1729 static void
1730 irdma_clean_cqes(struct irdma_qp_uk *qp, struct irdma_ucq *iwucq)
1731 {
1732 	struct irdma_cq_uk *ukcq = &iwucq->cq;
1733 	int ret;
1734 
1735 	ret = pthread_spin_lock(&iwucq->lock);
1736 	if (ret)
1737 		return;
1738 
1739 	irdma_uk_clean_cq(qp, ukcq);
1740 	pthread_spin_unlock(&iwucq->lock);
1741 }
1742 
1743 /**
1744  * irdma_udestroy_qp - destroy qp
1745  * @qp: qp to destroy
1746  */
1747 int
1748 irdma_udestroy_qp(struct ibv_qp *qp)
1749 {
1750 	struct irdma_uqp *iwuqp;
1751 	int ret;
1752 
1753 	iwuqp = container_of(qp, struct irdma_uqp, ibv_qp);
1754 	if (iwuqp->flush_thread) {
1755 		pthread_cancel(iwuqp->flush_thread);
1756 		pthread_join(iwuqp->flush_thread, NULL);
1757 	}
1758 	ret = pthread_spin_destroy(&iwuqp->lock);
1759 	if (ret)
1760 		goto err;
1761 
1762 	ret = irdma_destroy_vmapped_qp(iwuqp);
1763 	if (ret)
1764 		goto err;
1765 
1766 	/* Clean any pending completions from the cq(s) */
1767 	if (iwuqp->send_cq)
1768 		irdma_clean_cqes(&iwuqp->qp, iwuqp->send_cq);
1769 
1770 	if (iwuqp->recv_cq && iwuqp->recv_cq != iwuqp->send_cq)
1771 		irdma_clean_cqes(&iwuqp->qp, iwuqp->recv_cq);
1772 
1773 	if (iwuqp->qp.sq_wrtrk_array)
1774 		free(iwuqp->qp.sq_wrtrk_array);
1775 	if (iwuqp->qp.rq_wrid_array)
1776 		free(iwuqp->qp.rq_wrid_array);
1777 
1778 	irdma_free_hw_buf(iwuqp->qp.sq_base, iwuqp->buf_size);
1779 	free(iwuqp->recv_sges);
1780 	free(iwuqp);
1781 	return 0;
1782 
1783 err:
1784 	printf("%s: failed to destroy QP, status %d\n",
1785 	       __func__, ret);
1786 	return ret;
1787 }
1788 
1789 /**
1790  * irdma_copy_sg_list - copy sg list for qp
1791  * @sg_list: destination sg list
1792  * @sgl: source sg list
1793  * @num_sges: count of sg entries to copy
1795  */
1796 static void
1797 irdma_copy_sg_list(struct irdma_sge *sg_list, struct ibv_sge *sgl,
1798 		   int num_sges)
1799 {
1800 	int i;
1801 
1802 	for (i = 0; i < num_sges; i++) {
1803 		sg_list[i].tag_off = sgl[i].addr;
1804 		sg_list[i].len = sgl[i].length;
1805 		sg_list[i].stag = sgl[i].lkey;
1806 	}
1807 }
1808 
1809 /**
1810  * calc_type2_mw_stag - calculate type 2 MW stag
1811  * @rkey: desired rkey of the MW
1812  * @mw_rkey: type2 memory window rkey
1813  *
1814  * compute type2 memory window stag by taking lower 8 bits
1815  * of the desired rkey and leaving the upper 24 bits of mw->rkey unchanged
1816  */
1817 static inline u32 calc_type2_mw_stag(u32 rkey, u32 mw_rkey) {
1818 	const u32 mask = 0xff;
1819 
1820 	return (rkey & mask) | (mw_rkey & ~mask);
1821 }
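
/*
 * Illustrative example (values are arbitrary): for a desired rkey of
 * 0x11223344 and an existing mw_rkey of 0xAABBCCDD,
 * calc_type2_mw_stag() returns (0x11223344 & 0xff) | (0xAABBCCDD & ~0xff),
 * i.e. 0x44 | 0xAABBCC00 = 0xAABBCC44.
 */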
1822 
1823 /**
1824  * irdma_upost_send - post send wr for user application
1825  * @ib_qp: qp to post wr
1826  * @ib_wr: work request ptr
1827  * @bad_wr: return of bad wr if err
1828  */
1829 int
1830 irdma_upost_send(struct ibv_qp *ib_qp, struct ibv_send_wr *ib_wr,
1831 		 struct ibv_send_wr **bad_wr)
1832 {
1833 	struct irdma_post_sq_info info;
1834 	struct irdma_uvcontext *iwvctx;
1835 	struct irdma_uk_attrs *uk_attrs;
1836 	struct irdma_uqp *iwuqp;
1837 	bool reflush = false;
1838 	int err = 0;
1839 
1840 	iwuqp = container_of(ib_qp, struct irdma_uqp, ibv_qp);
1841 	iwvctx = container_of(ib_qp->context, struct irdma_uvcontext, ibv_ctx);
1842 	uk_attrs = &iwvctx->uk_attrs;
1843 
1844 	err = pthread_spin_lock(&iwuqp->lock);
1845 	if (err)
1846 		return err;
1847 
1848 	if (!IRDMA_RING_MORE_WORK(iwuqp->qp.sq_ring) &&
1849 	    ib_qp->state == IBV_QPS_ERR)
1850 		reflush = true;
1851 
1852 	while (ib_wr) {
1853 		memset(&info, 0, sizeof(info));
1854 		info.wr_id = (u64)(ib_wr->wr_id);
1855 		if ((ib_wr->send_flags & IBV_SEND_SIGNALED) ||
1856 		    iwuqp->sq_sig_all)
1857 			info.signaled = true;
1858 		if (ib_wr->send_flags & IBV_SEND_FENCE)
1859 			info.read_fence = true;
1860 		if (iwuqp->send_cq->report_rtt)
1861 			info.report_rtt = true;
1862 
1863 		switch (ib_wr->opcode) {
1864 		case IBV_WR_SEND_WITH_IMM:
1865 			if (iwuqp->qp.qp_caps & IRDMA_SEND_WITH_IMM) {
1866 				info.imm_data_valid = true;
1867 				info.imm_data = ntohl(ib_wr->imm_data);
1868 			} else {
1869 				err = EINVAL;
1870 				break;
1871 			}
1872 			/* fallthrough */
1873 		case IBV_WR_SEND:
1874 		case IBV_WR_SEND_WITH_INV:
1875 			if (ib_wr->opcode == IBV_WR_SEND ||
1876 			    ib_wr->opcode == IBV_WR_SEND_WITH_IMM) {
1877 				if (ib_wr->send_flags & IBV_SEND_SOLICITED)
1878 					info.op_type = IRDMA_OP_TYPE_SEND_SOL;
1879 				else
1880 					info.op_type = IRDMA_OP_TYPE_SEND;
1881 			} else {
1882 				if (ib_wr->send_flags & IBV_SEND_SOLICITED)
1883 					info.op_type = IRDMA_OP_TYPE_SEND_SOL_INV;
1884 				else
1885 					info.op_type = IRDMA_OP_TYPE_SEND_INV;
1886 				info.stag_to_inv = ib_wr->imm_data;
1887 			}
1888 			if (ib_wr->send_flags & IBV_SEND_INLINE) {
1889 				info.op.inline_send.data = (void *)(uintptr_t)ib_wr->sg_list[0].addr;
1890 				info.op.inline_send.len = ib_wr->sg_list[0].length;
1891 				if (ib_qp->qp_type == IBV_QPT_UD) {
1892 					struct irdma_uah *ah = container_of(ib_wr->wr.ud.ah,
1893 									    struct irdma_uah, ibv_ah);
1894 
1895 					info.op.inline_send.ah_id = ah->ah_id;
1896 					info.op.inline_send.qkey = ib_wr->wr.ud.remote_qkey;
1897 					info.op.inline_send.dest_qp = ib_wr->wr.ud.remote_qpn;
1898 				}
1899 				err = irdma_uk_inline_send(&iwuqp->qp, &info, false);
1900 			} else {
1901 				info.op.send.num_sges = ib_wr->num_sge;
1902 				info.op.send.sg_list = (struct irdma_sge *)ib_wr->sg_list;
1903 				if (ib_qp->qp_type == IBV_QPT_UD) {
1904 					struct irdma_uah *ah = container_of(ib_wr->wr.ud.ah,
1905 									    struct irdma_uah, ibv_ah);
1906 
1907 					info.op.send.ah_id = ah->ah_id;
1908 					info.op.send.qkey = ib_wr->wr.ud.remote_qkey;
1909 					info.op.send.dest_qp = ib_wr->wr.ud.remote_qpn;
1910 				}
1911 				err = irdma_uk_send(&iwuqp->qp, &info, false);
1912 			}
1913 			break;
1914 		case IBV_WR_RDMA_WRITE_WITH_IMM:
1915 			if (iwuqp->qp.qp_caps & IRDMA_WRITE_WITH_IMM) {
1916 				info.imm_data_valid = true;
1917 				info.imm_data = ntohl(ib_wr->imm_data);
1918 			} else {
1919 				err = EINVAL;
1920 				break;
1921 			}
1922 			/* fallthrough */
1923 		case IBV_WR_RDMA_WRITE:
1924 			if (ib_wr->send_flags & IBV_SEND_SOLICITED)
1925 				info.op_type = IRDMA_OP_TYPE_RDMA_WRITE_SOL;
1926 			else
1927 				info.op_type = IRDMA_OP_TYPE_RDMA_WRITE;
1928 
1929 			if (ib_wr->send_flags & IBV_SEND_INLINE) {
1930 				info.op.inline_rdma_write.data = (void *)(uintptr_t)ib_wr->sg_list[0].addr;
1931 				info.op.inline_rdma_write.len = ib_wr->sg_list[0].length;
1932 				info.op.inline_rdma_write.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr;
1933 				info.op.inline_rdma_write.rem_addr.stag = ib_wr->wr.rdma.rkey;
1934 				err = irdma_uk_inline_rdma_write(&iwuqp->qp, &info, false);
1935 			} else {
1936 				info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list;
1937 				info.op.rdma_write.num_lo_sges = ib_wr->num_sge;
1938 				info.op.rdma_write.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr;
1939 				info.op.rdma_write.rem_addr.stag = ib_wr->wr.rdma.rkey;
1940 				err = irdma_uk_rdma_write(&iwuqp->qp, &info, false);
1941 			}
1942 			break;
1943 		case IBV_WR_RDMA_READ:
1944 			if (ib_wr->num_sge > uk_attrs->max_hw_read_sges) {
1945 				err = EINVAL;
1946 				break;
1947 			}
1948 			info.op_type = IRDMA_OP_TYPE_RDMA_READ;
1949 			info.op.rdma_read.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr;
1950 			info.op.rdma_read.rem_addr.stag = ib_wr->wr.rdma.rkey;
1951 
1952 			info.op.rdma_read.lo_sg_list = (void *)ib_wr->sg_list;
1953 			info.op.rdma_read.num_lo_sges = ib_wr->num_sge;
1954 			err = irdma_uk_rdma_read(&iwuqp->qp, &info, false, false);
1955 			break;
1956 		case IBV_WR_BIND_MW:
1957 			if (ib_qp->qp_type != IBV_QPT_RC) {
1958 				err = EINVAL;
1959 				break;
1960 			}
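			/*
			 * Type 1 windows use the rkey supplied in the WR directly.
			 * Type 2 windows keep the upper 24 bits of the current MW
			 * rkey and take only the low 8 key bits from the WR (see
			 * calc_type2_mw_stag above); zero-based MRs are rejected
			 * for a type 2 bind here.
			 */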
1961 			info.op_type = IRDMA_OP_TYPE_BIND_MW;
1962 			info.op.bind_window.mr_stag = ib_wr->bind_mw.bind_info.mr->rkey;
1963 			if (ib_wr->bind_mw.mw->type == IBV_MW_TYPE_1) {
1964 				info.op.bind_window.mem_window_type_1 = true;
1965 				info.op.bind_window.mw_stag = ib_wr->bind_mw.rkey;
1966 			} else {
1967 				struct verbs_mr *vmr = verbs_get_mr(ib_wr->bind_mw.bind_info.mr);
1968 				struct irdma_umr *umr = container_of(vmr, struct irdma_umr, vmr);
1969 
1970 				if (umr->acc_flags & IBV_ACCESS_ZERO_BASED) {
1971 					err = EINVAL;
1972 					break;
1973 				}
1974 				info.op.bind_window.mw_stag =
1975 				    calc_type2_mw_stag(ib_wr->bind_mw.rkey, ib_wr->bind_mw.mw->rkey);
1976 				ib_wr->bind_mw.mw->rkey = info.op.bind_window.mw_stag;
1977 
1978 			}
1979 
1980 			if (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_ZERO_BASED) {
1981 				info.op.bind_window.addressing_type = IRDMA_ADDR_TYPE_ZERO_BASED;
1982 				info.op.bind_window.va = NULL;
1983 			} else {
1984 				info.op.bind_window.addressing_type = IRDMA_ADDR_TYPE_VA_BASED;
1985 				info.op.bind_window.va = (void *)(uintptr_t)ib_wr->bind_mw.bind_info.addr;
1986 			}
1987 			info.op.bind_window.bind_len = ib_wr->bind_mw.bind_info.length;
1988 			info.op.bind_window.ena_reads =
1989 			    (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_REMOTE_READ) ? 1 : 0;
1990 			info.op.bind_window.ena_writes =
1991 			    (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_REMOTE_WRITE) ? 1 : 0;
1992 
1993 			err = irdma_uk_mw_bind(&iwuqp->qp, &info, false);
1994 			break;
1995 		case IBV_WR_LOCAL_INV:
1996 			info.op_type = IRDMA_OP_TYPE_INV_STAG;
1997 			info.op.inv_local_stag.target_stag = ib_wr->imm_data;
1998 			err = irdma_uk_stag_local_invalidate(&iwuqp->qp, &info, true);
1999 			break;
2000 		default:
2001 			/* error */
2002 			err = EINVAL;
2003 			fprintf(stderr, "%s: post work request failed, invalid opcode: 0x%x\n",
2004 				__func__, ib_wr->opcode);
2005 			break;
2006 		}
2007 		if (err)
2008 			break;
2009 
2010 		ib_wr = ib_wr->next;
2011 	}
2012 
2013 	if (err)
2014 		*bad_wr = ib_wr;
2015 
2016 	irdma_uk_qp_post_wr(&iwuqp->qp);
2017 	if (reflush)
2018 		irdma_issue_flush(ib_qp, 1, 0);
2019 
2020 	pthread_spin_unlock(&iwuqp->lock);
2021 
2022 	return err;
2023 }
2024 
2025 /**
2026  * irdma_upost_recv - post receive wr for user application
 * @ib_qp: qp to post wr
2027  * @ib_wr: work request for receive
2028  * @bad_wr: return of bad wr if err
2029  */
2030 int
2031 irdma_upost_recv(struct ibv_qp *ib_qp, struct ibv_recv_wr *ib_wr,
2032 		 struct ibv_recv_wr **bad_wr)
2033 {
2034 	struct irdma_post_rq_info post_recv = {};
2035 	struct irdma_sge *sg_list;
2036 	struct irdma_uqp *iwuqp;
2037 	bool reflush = false;
2038 	int err = 0;
2039 
2040 	iwuqp = container_of(ib_qp, struct irdma_uqp, ibv_qp);
2041 	sg_list = iwuqp->recv_sges;
2042 
2043 	err = pthread_spin_lock(&iwuqp->lock);
2044 	if (err)
2045 		return err;
2046 
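	/*
	 * Same idea as the send path: if the QP is already in the error
	 * state and the RQ is empty, the receive WRs posted below are
	 * flushed right away so they complete back to the application.
	 */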
2047 	if (!IRDMA_RING_MORE_WORK(iwuqp->qp.rq_ring) &&
2048 	    ib_qp->state == IBV_QPS_ERR)
2049 		reflush = true;
2050 
2051 	while (ib_wr) {
2052 		if (ib_wr->num_sge > iwuqp->qp.max_rq_frag_cnt) {
2053 			*bad_wr = ib_wr;
2054 			err = EINVAL;
2055 			goto error;
2056 		}
2057 		post_recv.num_sges = ib_wr->num_sge;
2058 		post_recv.wr_id = ib_wr->wr_id;
2059 		irdma_copy_sg_list(sg_list, ib_wr->sg_list, ib_wr->num_sge);
2060 		post_recv.sg_list = sg_list;
2061 		err = irdma_uk_post_receive(&iwuqp->qp, &post_recv);
2062 		if (err) {
2063 			*bad_wr = ib_wr;
2064 			goto error;
2065 		}
2066 
2067 		if (reflush)
2068 			irdma_issue_flush(ib_qp, 0, 1);
2069 
2070 		ib_wr = ib_wr->next;
2071 	}
2072 error:
2073 	pthread_spin_unlock(&iwuqp->lock);
2074 
2075 	return err;
2076 }
2077 
2078 /**
2079  * irdma_ucreate_ah - create address handle associated with a pd
2080  * @ibpd: pd for the address handle
2081  * @attr: attributes of address handle
2082  */
2083 struct ibv_ah *
2084 irdma_ucreate_ah(struct ibv_pd *ibpd, struct ibv_ah_attr *attr)
2085 {
2086 	struct irdma_uah *ah;
2087 	union ibv_gid sgid;
2088 	struct irdma_ucreate_ah_resp resp;
2089 	int err;
2090 
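	/*
	 * The GID query effectively serves only to validate the
	 * port/sgid_index pair; the returned value is not used further
	 * in this function.
	 */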
2091 	err = ibv_query_gid(ibpd->context, attr->port_num, attr->grh.sgid_index,
2092 			    &sgid);
2093 	if (err) {
2094 		fprintf(stderr, "irdma: Error from ibv_query_gid.\n");
2095 		errno = err;
2096 		return NULL;
2097 	}
2098 
2099 	ah = calloc(1, sizeof(*ah));
2100 	if (!ah)
2101 		return NULL;
2102 
2103 	err = ibv_cmd_create_ah(ibpd, &ah->ibv_ah, attr, &resp.ibv_resp,
2104 				sizeof(resp));
2105 	if (err) {
2106 		free(ah);
2107 		errno = err;
2108 		return NULL;
2109 	}
2110 
2111 	ah->ah_id = resp.ah_id;
2112 
2113 	return &ah->ibv_ah;
2114 }
2115 
2116 /**
2117  * irdma_udestroy_ah - destroy the address handle
2118  * @ibah: address handle
2119  */
2120 int
2121 irdma_udestroy_ah(struct ibv_ah *ibah)
2122 {
2123 	struct irdma_uah *ah;
2124 	int ret;
2125 
2126 	ah = container_of(ibah, struct irdma_uah, ibv_ah);
2127 
2128 	ret = ibv_cmd_destroy_ah(ibah);
2129 	if (ret)
2130 		return ret;
2131 
2132 	free(ah);
2133 
2134 	return 0;
2135 }
2136 
2137 /**
2138  * irdma_uattach_mcast - attach a qp to a multicast group
2139  * @qp: The queue pair
2140  * @gid: The Global ID for the multicast group
2141  * @lid: The Local ID
2142  */
2143 int
2144 irdma_uattach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
2145 		    uint16_t lid)
2146 {
2147 	return ibv_cmd_attach_mcast(qp, gid, lid);
2148 }
2149 
2150 /**
2151  * irdma_udetach_mcast - detach a qp from a multicast group
2152  * @qp: The queue pair
2153  * @gid: The Global ID for the multicast group
2154  * @lid: The Local ID
2155  */
2156 int
2157 irdma_udetach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
2158 		    uint16_t lid)
2159 {
2160 	return ibv_cmd_detach_mcast(qp, gid, lid);
2161 }
2162 
2163 /**
2164  * irdma_uresize_cq - resizes a cq
2165  * @cq: cq to resize
2166  * @cqe: the number of cqes of the new cq
2167  */
2168 int
2169 irdma_uresize_cq(struct ibv_cq *cq, int cqe)
2170 {
2171 	struct irdma_uvcontext *iwvctx;
2172 	struct irdma_uk_attrs *uk_attrs;
2173 	struct irdma_uresize_cq cmd = {};
2174 	struct ibv_resize_cq_resp resp = {};
2175 	struct irdma_ureg_mr reg_mr_cmd = {};
2176 	struct ibv_reg_mr_resp reg_mr_resp = {};
2177 	struct irdma_cq_buf *cq_buf = NULL;
2178 	struct irdma_cqe *cq_base = NULL;
2179 	struct verbs_mr new_mr = {};
2180 	struct irdma_ucq *iwucq;
2181 	size_t cq_size;
2182 	u32 cq_pages;
2183 	int cqe_needed;
2184 	int ret = 0;
2185 
2186 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
2187 	iwvctx = container_of(cq->context, struct irdma_uvcontext, ibv_ctx);
2188 	uk_attrs = &iwvctx->uk_attrs;
2189 
2190 	if (!(uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE))
2191 		return EOPNOTSUPP;
2192 
2193 	if (cqe > IRDMA_MAX_CQ_SIZE)
2194 		return EINVAL;
2195 
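	/*
	 * Size the replacement CQ: one extra entry is added, the count is
	 * doubled on post-GEN_1 hardware, and the result is never allowed
	 * to fall below the minimum CQ size the device supports.
	 */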
2196 	cqe_needed = cqe + 1;
2197 	if (uk_attrs->hw_rev > IRDMA_GEN_1)
2198 		cqe_needed *= 2;
2199 
2200 	if (cqe_needed < IRDMA_U_MINCQ_SIZE)
2201 		cqe_needed = IRDMA_U_MINCQ_SIZE;
2202 
2203 	if (cqe_needed == iwucq->cq.cq_size)
2204 		return 0;
2205 
2206 	cq_size = get_cq_total_bytes(cqe_needed);
2207 	cq_pages = cq_size >> IRDMA_HW_PAGE_SHIFT;
2208 	cq_base = irdma_alloc_hw_buf(cq_size);
2209 	if (!cq_base)
2210 		return ENOMEM;
2211 
2212 	memset(cq_base, 0, cq_size);
2213 
2214 	cq_buf = malloc(sizeof(*cq_buf));
2215 	if (!cq_buf) {
2216 		ret = ENOMEM;
2217 		goto err_buf;
2218 	}
2219 
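	/*
	 * The replacement CQ buffer must be registered with the kernel as
	 * a CQ-type memory region before ibv_cmd_resize_cq can point the
	 * hardware at it.
	 */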
2220 	new_mr.ibv_mr.pd = iwucq->vmr.ibv_mr.pd;
2221 	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
2222 	reg_mr_cmd.cq_pages = cq_pages;
2223 
2224 	ret = ibv_cmd_reg_mr(new_mr.ibv_mr.pd, cq_base, cq_size,
2225 			     (uintptr_t)cq_base, IBV_ACCESS_LOCAL_WRITE,
2226 			     &new_mr.ibv_mr, &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd),
2227 			     &reg_mr_resp, sizeof(reg_mr_resp));
2228 	if (ret)
2229 		goto err_dereg_mr;
2230 
2231 	ret = pthread_spin_lock(&iwucq->lock);
2232 	if (ret)
2233 		goto err_lock;
2234 
2235 	cmd.user_cq_buffer = (__u64) ((uintptr_t)cq_base);
2236 	ret = ibv_cmd_resize_cq(&iwucq->verbs_cq.cq, cqe_needed, &cmd.ibv_cmd,
2237 				sizeof(cmd), &resp, sizeof(resp));
2238 	if (ret)
2239 		goto err_resize;
2240 
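	/*
	 * Keep the old CQ buffer and its MR on the resize list so that
	 * completions already written there can still be reaped before
	 * the buffer is freed.
	 */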
2241 	memcpy(&cq_buf->cq, &iwucq->cq, sizeof(cq_buf->cq));
2242 	cq_buf->vmr = iwucq->vmr;
2243 	iwucq->vmr = new_mr;
2244 	irdma_uk_cq_resize(&iwucq->cq, cq_base, cqe_needed);
2245 	iwucq->verbs_cq.cq.cqe = cqe;
2246 	LIST_INSERT_HEAD(&iwucq->resize_list, cq_buf, list);
2247 
2248 	pthread_spin_unlock(&iwucq->lock);
2249 
2250 	return ret;
2251 
2252 err_resize:
2253 	pthread_spin_unlock(&iwucq->lock);
2254 err_lock:
2255 	ibv_cmd_dereg_mr(&new_mr.ibv_mr);
2256 err_dereg_mr:
2257 	free(cq_buf);
2258 err_buf:
2259 	fprintf(stderr, "failed to resize CQ cq_id=%d ret=%d\n", iwucq->cq.cq_id, ret);
2260 	irdma_free_hw_buf(cq_base, cq_size);
2261 	return ret;
2262 }
2263