xref: /freebsd/contrib/ofed/libirdma/irdma_uverbs.c (revision 734e82fe33aa764367791a7d603b383996c6b40b)
1 /*-
2  * SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
3  *
4  * Copyright (C) 2019 - 2023 Intel Corporation
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenFabrics.org BSD license below:
11  *
12  *   Redistribution and use in source and binary forms, with or
13  *   without modification, are permitted provided that the following
14  *   conditions are met:
15  *
16  *    - Redistributions of source code must retain the above
17  *	copyright notice, this list of conditions and the following
18  *	disclaimer.
19  *
20  *    - Redistributions in binary form must reproduce the above
21  *	copyright notice, this list of conditions and the following
22  *	disclaimer in the documentation and/or other materials
23  *	provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  */
34 /*$FreeBSD$*/
35 
36 #include <config.h>
37 #include <stdlib.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <signal.h>
42 #include <errno.h>
43 #include <sys/param.h>
44 #include <sys/mman.h>
45 #include <netinet/in.h>
46 #include <sys/stat.h>
47 #include <fcntl.h>
48 #include <stdbool.h>
49 #include <infiniband/opcode.h>
50 
51 #include "irdma_umain.h"
52 #include "abi.h"
53 
/*
 * print_fw_ver - render a packed firmware version as "major.minor"
 * @fw_ver: packed version; major is taken from bits 47:32, minor from bits 15:0
 * @str: destination buffer
 * @len: size of @str in bytes (snprintf truncates the output to fit)
 */
static inline void
print_fw_ver(uint64_t fw_ver, char *str, size_t len)
{
	const uint16_t fw_major = (uint16_t)((fw_ver >> 32) & 0xffff);
	const uint16_t fw_minor = (uint16_t)(fw_ver & 0xffff);

	snprintf(str, len, "%d.%d", fw_major, fw_minor);
}
64 
65 /**
66  * irdma_uquery_device_ex - query device attributes including extended properties
67  * @context: user context for the device
68  * @input: extensible input struct for ibv_query_device_ex verb
69  * @attr: extended device attribute struct
70  * @attr_size: size of extended device attribute struct
71  **/
72 int
73 irdma_uquery_device_ex(struct ibv_context *context,
74 		       const struct ibv_query_device_ex_input *input,
75 		       struct ibv_device_attr_ex *attr, size_t attr_size)
76 {
77 	struct irdma_query_device_ex cmd = {};
78 	struct irdma_query_device_ex_resp resp = {};
79 	uint64_t fw_ver;
80 	int ret;
81 
82 	ret = ibv_cmd_query_device_ex(context, input, attr, attr_size, &fw_ver,
83 				      &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd),
84 				      &resp.ibv_resp, sizeof(resp.ibv_resp), sizeof(resp));
85 	if (ret)
86 		return ret;
87 
88 	print_fw_ver(fw_ver, attr->orig_attr.fw_ver, sizeof(attr->orig_attr.fw_ver));
89 
90 	return 0;
91 }
92 
93 /**
94  * irdma_uquery_device - call driver to query device for max resources
95  * @context: user context for the device
96  * @attr: where to save all the mx resources from the driver
97  **/
98 int
99 irdma_uquery_device(struct ibv_context *context, struct ibv_device_attr *attr)
100 {
101 	struct ibv_query_device cmd;
102 	uint64_t fw_ver;
103 	int ret;
104 
105 	ret = ibv_cmd_query_device(context, attr, &fw_ver, &cmd, sizeof(cmd));
106 	if (ret)
107 		return ret;
108 
109 	print_fw_ver(fw_ver, attr->fw_ver, sizeof(attr->fw_ver));
110 
111 	return 0;
112 }
113 
114 /**
115  * irdma_uquery_port - get port attributes (msg size, lnk, mtu...)
116  * @context: user context of the device
117  * @port: port for the attributes
118  * @attr: to return port attributes
119  **/
120 int
121 irdma_uquery_port(struct ibv_context *context, uint8_t port,
122 		  struct ibv_port_attr *attr)
123 {
124 	struct ibv_query_port cmd;
125 
126 	return ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
127 }
128 
129 /**
130  * irdma_ualloc_pd - allocates protection domain and return pd ptr
131  * @context: user context of the device
132  **/
133 struct ibv_pd *
134 irdma_ualloc_pd(struct ibv_context *context)
135 {
136 	struct ibv_alloc_pd cmd;
137 	struct irdma_ualloc_pd_resp resp = {};
138 	struct irdma_upd *iwupd;
139 	int err;
140 
141 	iwupd = calloc(1, sizeof(*iwupd));
142 	if (!iwupd)
143 		return NULL;
144 
145 	err = ibv_cmd_alloc_pd(context, &iwupd->ibv_pd, &cmd, sizeof(cmd),
146 			       &resp.ibv_resp, sizeof(resp));
147 	if (err)
148 		goto err_free;
149 
150 	iwupd->pd_id = resp.pd_id;
151 
152 	return &iwupd->ibv_pd;
153 
154 err_free:
155 	free(iwupd);
156 	errno = err;
157 	return NULL;
158 }
159 
160 /**
161  * irdma_ufree_pd - free pd resources
162  * @pd: pd to free resources
163  */
164 int
165 irdma_ufree_pd(struct ibv_pd *pd)
166 {
167 	struct irdma_uvcontext *iwvctx = container_of(pd->context, struct irdma_uvcontext, ibv_ctx);
168 	struct irdma_upd *iwupd;
169 	int ret;
170 
171 	iwupd = container_of(pd, struct irdma_upd, ibv_pd);
172 	ret = ibv_cmd_dealloc_pd(pd);
173 	if (ret)
174 		return ret;
175 
176 	free(iwupd);
177 
178 	return 0;
179 }
180 
181 /**
182  * irdma_ureg_mr - register user memory region
183  * @pd: pd for the mr
184  * @addr: user address of the memory region
185  * @length: length of the memory
186  * @hca_va: hca_va
187  * @access: access allowed on this mr
188  */
189 struct ibv_mr *
190 irdma_ureg_mr(struct ibv_pd *pd, void *addr, size_t length,
191 	      int access)
192 {
193 	struct verbs_mr *vmr;
194 	struct irdma_ureg_mr cmd = {};
195 	struct ibv_reg_mr_resp resp;
196 	int err;
197 
198 	vmr = malloc(sizeof(*vmr));
199 	if (!vmr)
200 		return NULL;
201 
202 	cmd.reg_type = IRDMA_MEMREG_TYPE_MEM;
203 	err = ibv_cmd_reg_mr(pd, addr, length,
204 			     (uintptr_t)addr, access, &vmr->ibv_mr, &cmd.ibv_cmd,
205 			     sizeof(cmd), &resp, sizeof(resp));
206 	if (err) {
207 		free(vmr);
208 		errno = err;
209 		return NULL;
210 	}
211 
212 	return &vmr->ibv_mr;
213 }
214 
215 /*
216  * irdma_urereg_mr - re-register memory region @vmr: mr that was allocated @flags: bit mask to indicate which of the
217  * attr's of MR modified @pd: pd of the mr @addr: user address of the memory region @length: length of the memory
218  * @access: access allowed on this mr
219  */
220 int
221 irdma_urereg_mr(struct verbs_mr *vmr, int flags, struct ibv_pd *pd,
222 		void *addr, size_t length, int access)
223 {
224 	struct irdma_urereg_mr cmd = {};
225 	struct ibv_rereg_mr_resp resp;
226 
227 	cmd.reg_type = IRDMA_MEMREG_TYPE_MEM;
228 	return ibv_cmd_rereg_mr(&vmr->ibv_mr, flags, addr, length, (uintptr_t)addr,
229 				access, pd, &cmd.ibv_cmd, sizeof(cmd), &resp,
230 				sizeof(resp));
231 }
232 
233 /**
234  * irdma_udereg_mr - re-register memory region
235  * @mr: mr that was allocated
236  */
237 int
238 irdma_udereg_mr(struct ibv_mr *mr)
239 {
240 	struct verbs_mr *vmr;
241 	int ret;
242 
243 	vmr = container_of(mr, struct verbs_mr, ibv_mr);
244 
245 	ret = ibv_cmd_dereg_mr(mr);
246 	if (ret)
247 		return ret;
248 
249 	return 0;
250 }
251 
252 /**
253  * irdma_ualloc_mw - allocate memory window
254  * @pd: protection domain
255  * @type: memory window type
256  */
257 struct ibv_mw *
258 irdma_ualloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
259 {
260 	struct ibv_mw *mw;
261 	struct ibv_alloc_mw cmd;
262 	struct ibv_alloc_mw_resp resp;
263 	int err;
264 
265 	mw = calloc(1, sizeof(*mw));
266 	if (!mw)
267 		return NULL;
268 
269 	err = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), &resp,
270 			       sizeof(resp));
271 	if (err) {
272 		printf("%s: Failed to alloc memory window\n",
273 		       __func__);
274 		free(mw);
275 		errno = err;
276 		return NULL;
277 	}
278 
279 	return mw;
280 }
281 
282 /**
283  * irdma_ubind_mw - bind a memory window
284  * @qp: qp to post WR
285  * @mw: memory window to bind
286  * @mw_bind: bind info
287  */
288 int
289 irdma_ubind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
290 	       struct ibv_mw_bind *mw_bind)
291 {
292 	struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info;
293 	struct verbs_mr *vmr;
294 
295 	struct ibv_send_wr wr = {};
296 	struct ibv_send_wr *bad_wr;
297 	int err;
298 
299 	if (!bind_info->mr && (bind_info->addr || bind_info->length))
300 		return EINVAL;
301 
302 	if (bind_info->mr) {
303 		vmr = verbs_get_mr(bind_info->mr);
304 		if (vmr->mr_type != IBV_MR_TYPE_MR)
305 			return ENOTSUP;
306 
307 		if (vmr->access & IBV_ACCESS_ZERO_BASED)
308 			return EINVAL;
309 
310 		if (mw->pd != bind_info->mr->pd)
311 			return EPERM;
312 	}
313 
314 	wr.opcode = IBV_WR_BIND_MW;
315 	wr.bind_mw.bind_info = mw_bind->bind_info;
316 	wr.bind_mw.mw = mw;
317 	wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey);
318 
319 	wr.wr_id = mw_bind->wr_id;
320 	wr.send_flags = mw_bind->send_flags;
321 
322 	err = irdma_upost_send(qp, &wr, &bad_wr);
323 	if (!err)
324 		mw->rkey = wr.bind_mw.rkey;
325 
326 	return err;
327 }
328 
329 /**
330  * irdma_udealloc_mw - deallocate memory window
331  * @mw: memory window to dealloc
332  */
333 int
334 irdma_udealloc_mw(struct ibv_mw *mw)
335 {
336 	int ret;
337 	struct ibv_dealloc_mw cmd;
338 
339 	ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd));
340 	if (ret)
341 		return ret;
342 	free(mw);
343 
344 	return 0;
345 }
346 
347 static void *
348 irdma_alloc_hw_buf(size_t size)
349 {
350 	void *buf;
351 
352 	buf = memalign(IRDMA_HW_PAGE_SIZE, size);
353 
354 	if (!buf)
355 		return NULL;
356 	if (ibv_dontfork_range(buf, size)) {
357 		free(buf);
358 		return NULL;
359 	}
360 
361 	return buf;
362 }
363 
/*
 * irdma_free_hw_buf - release a buffer obtained from irdma_alloc_hw_buf()
 * @buf: buffer to release
 * @size: size that was used when the buffer was allocated
 *
 * Undoes the ibv_dontfork_range() marking before freeing the memory.
 */
static void
irdma_free_hw_buf(void *buf, size_t size)
{
	(void)ibv_dofork_range(buf, size);

	free(buf);
}
370 
371 /**
372  * get_cq_size - returns actual cqe needed by HW
373  * @ncqe: minimum cqes requested by application
374  * @hw_rev: HW generation
375  * @cqe_64byte_ena: enable 64byte cqe
376  */
377 static inline int
378 get_cq_size(int ncqe, u8 hw_rev, bool cqe_64byte_ena)
379 {
380 	ncqe++;
381 
382 	/* Completions with immediate require 1 extra entry */
383 	if (!cqe_64byte_ena && hw_rev > IRDMA_GEN_1)
384 		ncqe *= 2;
385 
386 	if (ncqe < IRDMA_U_MINCQ_SIZE)
387 		ncqe = IRDMA_U_MINCQ_SIZE;
388 
389 	return ncqe;
390 }
391 
392 static inline size_t get_cq_total_bytes(u32 cq_size, bool cqe_64byte_ena){
393 	if (cqe_64byte_ena)
394 		return roundup(cq_size * sizeof(struct irdma_extended_cqe), IRDMA_HW_PAGE_SIZE);
395 	else
396 		return roundup(cq_size * sizeof(struct irdma_cqe), IRDMA_HW_PAGE_SIZE);
397 }
398 
/**
 * ucreate_cq - irdma util function to create a CQ
 * @context: ibv context
 * @attr_ex: CQ init attributes
 * @ext_cq: flag to create an extendable or normal CQ
 *
 * Allocates and registers the CQ ring (plus a separately registered shadow
 * area when the device reports IRDMA_FEATURE_CQ_RESIZE), issues the create-CQ
 * command and initializes the user-space CQ state.
 *
 * attr_ex->cqe is temporarily replaced by the HW ring size for the create
 * command and put back to the caller's value right after it.
 *
 * Returns the new CQ, or NULL on failure.  errno is assigned explicitly on
 * every failure path except the calloc() failure.
 */
static struct ibv_cq_ex *
ucreate_cq(struct ibv_context *context,
	   struct ibv_cq_init_attr_ex *attr_ex,
	   bool ext_cq)
{
	struct irdma_cq_uk_init_info info = {};
	struct irdma_ureg_mr reg_mr_cmd = {};
	struct irdma_ucreate_cq_ex cmd = {};
	struct irdma_ucreate_cq_ex_resp resp = {};
	struct ibv_reg_mr_resp reg_mr_resp = {};
	struct irdma_ureg_mr reg_mr_shadow_cmd = {};
	struct ibv_reg_mr_resp reg_mr_shadow_resp = {};
	struct irdma_uk_attrs *uk_attrs;
	struct irdma_uvcontext *iwvctx;
	struct irdma_ucq *iwucq;
	size_t total_size;
	u32 cq_pages;
	int ret, ncqe;
	u8 hw_rev;
	bool cqe_64byte_ena;

	iwvctx = container_of(context, struct irdma_uvcontext, ibv_ctx);
	uk_attrs = &iwvctx->uk_attrs;
	hw_rev = uk_attrs->hw_rev;

	/* extended CQs are refused on GEN_1 and for unsupported wc_flags */
	if (ext_cq) {
		u32 supported_flags = IRDMA_STANDARD_WC_FLAGS_EX;

		if (hw_rev == IRDMA_GEN_1 || attr_ex->wc_flags & ~supported_flags) {
			errno = EOPNOTSUPP;
			return NULL;
		}
	}

	/* requested depth must lie in [min_hw_cq_size, max_hw_cq_size - 1] */
	if (attr_ex->cqe < uk_attrs->min_hw_cq_size || attr_ex->cqe > uk_attrs->max_hw_cq_size - 1) {
		errno = EINVAL;
		return NULL;
	}

	/* save the cqe requested by application */
	ncqe = attr_ex->cqe;

	iwucq = calloc(1, sizeof(*iwucq));
	if (!iwucq)
		return NULL;

	ret = pthread_spin_init(&iwucq->lock, PTHREAD_PROCESS_PRIVATE);
	if (ret) {
		free(iwucq);
		errno = ret;
		return NULL;
	}

	cqe_64byte_ena = uk_attrs->feature_flags & IRDMA_FEATURE_64_BYTE_CQE ? true : false;
	info.cq_size = get_cq_size(attr_ex->cqe, hw_rev, cqe_64byte_ena);
	iwucq->comp_vector = attr_ex->comp_vector;
	LIST_INIT(&iwucq->resize_list);
	LIST_INIT(&iwucq->cmpl_generated);
	total_size = get_cq_total_bytes(info.cq_size, cqe_64byte_ena);
	cq_pages = total_size >> IRDMA_HW_PAGE_SHIFT;

	/*
	 * Without CQ resize support the shadow area is carved out of the same
	 * allocation, directly after the ring pages.
	 */
	if (!(uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE))
		total_size = (cq_pages << IRDMA_HW_PAGE_SHIFT) + IRDMA_DB_SHADOW_AREA_SIZE;

	iwucq->buf_size = total_size;
	info.cq_base = irdma_alloc_hw_buf(total_size);
	if (!info.cq_base) {
		ret = ENOMEM;
		goto err_cq_base;
	}

	memset(info.cq_base, 0, total_size);
	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
	reg_mr_cmd.cq_pages = cq_pages;

	/* register the ring buffer against the context's own pd */
	ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.cq_base,
			     total_size, (uintptr_t)info.cq_base,
			     IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr.ibv_mr,
			     &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd),
			     &reg_mr_resp, sizeof(reg_mr_resp));
	/* nothing was registered yet: the target label only frees the buffer */
	if (ret)
		goto err_dereg_mr;

	iwucq->vmr.ibv_mr.pd = &iwvctx->iwupd->ibv_pd;

	if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) {
		/* resize-capable devices get a separately registered shadow area */
		info.shadow_area = irdma_alloc_hw_buf(IRDMA_DB_SHADOW_AREA_SIZE);
		if (!info.shadow_area) {
			ret = ENOMEM;
			goto err_alloc_shadow;
		}

		memset(info.shadow_area, 0, IRDMA_DB_SHADOW_AREA_SIZE);
		reg_mr_shadow_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
		reg_mr_shadow_cmd.cq_pages = 1;

		ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.shadow_area,
				     IRDMA_DB_SHADOW_AREA_SIZE, (uintptr_t)info.shadow_area,
				     IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr_shadow_area.ibv_mr,
				     &reg_mr_shadow_cmd.ibv_cmd, sizeof(reg_mr_shadow_cmd),
				     &reg_mr_shadow_resp, sizeof(reg_mr_shadow_resp));
		if (ret) {
			/* shadow buffer is freed here; the label unwinds the ring */
			irdma_free_hw_buf(info.shadow_area, IRDMA_DB_SHADOW_AREA_SIZE);
			goto err_alloc_shadow;
		}

		iwucq->vmr_shadow_area.ibv_mr.pd = &iwvctx->iwupd->ibv_pd;

	} else {
		info.shadow_area = (__le64 *) ((u8 *)info.cq_base + (cq_pages << IRDMA_HW_PAGE_SHIFT));
	}

	/* the create command is issued with the HW ring size, not the request */
	attr_ex->cqe = info.cq_size;
	cmd.user_cq_buf = (__u64) ((uintptr_t)info.cq_base);
	cmd.user_shadow_area = (__u64) ((uintptr_t)info.shadow_area);

	ret = ibv_cmd_create_cq_ex(context, attr_ex, &iwucq->verbs_cq.cq_ex,
				   &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd), &resp.ibv_resp,
				   sizeof(resp.ibv_resp), sizeof(resp));
	/* put back the caller's value whether or not the command succeeded */
	attr_ex->cqe = ncqe;
	if (ret)
		goto err_create_cq;

	if (ext_cq)
		irdma_ibvcq_ex_fill_priv_funcs(iwucq, attr_ex);
	info.cq_id = resp.cq_id;
	/* Do not report the CQE's reserved for immediate and burned by HW */
	iwucq->verbs_cq.cq.cqe = ncqe;
	if (cqe_64byte_ena)
		info.avoid_mem_cflct = true;
	info.cqe_alloc_db = (u32 *)((u8 *)iwvctx->db + IRDMA_DB_CQ_OFFSET);
	irdma_uk_cq_init(&iwucq->cq, &info);
	return &iwucq->verbs_cq.cq_ex;

	/*
	 * Unwind ladder: each label releases what was set up before the
	 * corresponding failure and falls through to the labels below it.
	 */
err_create_cq:
	/*
	 * A non-zero handle is used as the sign that a separate shadow MR was
	 * registered (iwucq was calloc'ed, so the field starts out as 0).
	 */
	if (iwucq->vmr_shadow_area.ibv_mr.handle) {
		ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area.ibv_mr);
		irdma_free_hw_buf(info.shadow_area, IRDMA_DB_SHADOW_AREA_SIZE);
	}
err_alloc_shadow:
	ibv_cmd_dereg_mr(&iwucq->vmr.ibv_mr);
err_dereg_mr:
	irdma_free_hw_buf(info.cq_base, total_size);
err_cq_base:
	printf("%s: failed to initialize CQ\n", __func__);
	pthread_spin_destroy(&iwucq->lock);

	free(iwucq);

	errno = ret;
	return NULL;
}
557 
558 struct ibv_cq *
559 irdma_ucreate_cq(struct ibv_context *context, int cqe,
560 		 struct ibv_comp_channel *channel,
561 		 int comp_vector)
562 {
563 	struct ibv_cq_init_attr_ex attr_ex = {
564 		.cqe = cqe,
565 		.channel = channel,
566 		.comp_vector = comp_vector,
567 	};
568 	struct ibv_cq_ex *ibvcq_ex;
569 
570 	ibvcq_ex = ucreate_cq(context, &attr_ex, false);
571 
572 	return ibvcq_ex ? ibv_cq_ex_to_cq(ibvcq_ex) : NULL;
573 }
574 
/**
 * irdma_ucreate_cq_ex - create an extended completion queue
 * @context: ibv context
 * @attr_ex: CQ init attributes
 *
 * Defers to ucreate_cq() with the extended-CQ flag set.
 * Returns the new CQ or NULL on failure.
 */
struct ibv_cq_ex *
irdma_ucreate_cq_ex(struct ibv_context *context,
		    struct ibv_cq_init_attr_ex *attr_ex)
{
	struct ibv_cq_ex *cq_ex;

	cq_ex = ucreate_cq(context, attr_ex, true);

	return cq_ex;
}
581 
582 /**
583  * irdma_free_cq_buf - free memory for cq buffer
584  * @cq_buf: cq buf to free
585  */
586 static void
587 irdma_free_cq_buf(struct irdma_cq_buf *cq_buf)
588 {
589 	ibv_cmd_dereg_mr(&cq_buf->vmr.ibv_mr);
590 	irdma_free_hw_buf(cq_buf->cq.cq_base, cq_buf->buf_size);
591 	free(cq_buf);
592 }
593 
594 /**
595  * irdma_process_resize_list - process the cq list to remove buffers
596  * @iwucq: cq which owns the list
597  * @lcqe_buf: cq buf where the last cqe is found
598  */
599 static int
600 irdma_process_resize_list(struct irdma_ucq *iwucq,
601 			  struct irdma_cq_buf *lcqe_buf)
602 {
603 	struct irdma_cq_buf *cq_buf, *next;
604 	int cq_cnt = 0;
605 
606 	LIST_FOREACH_SAFE(cq_buf, &iwucq->resize_list, list, next) {
607 		if (cq_buf == lcqe_buf)
608 			return cq_cnt;
609 
610 		LIST_REMOVE(cq_buf, list);
611 		irdma_free_cq_buf(cq_buf);
612 		cq_cnt++;
613 	}
614 
615 	return cq_cnt;
616 }
617 
618 static void
619 irdma_remove_cmpls_list(struct irdma_ucq *iwucq)
620 {
621 	struct irdma_cmpl_gen *cmpl_node, *next;
622 
623 	LIST_FOREACH_SAFE(cmpl_node, &iwucq->cmpl_generated, list, next) {
624 		LIST_REMOVE(cmpl_node, list);
625 		free(cmpl_node);
626 	}
627 }
628 
629 static int
630 irdma_generated_cmpls(struct irdma_ucq *iwucq, struct irdma_cq_poll_info *cq_poll_info)
631 {
632 	struct irdma_cmpl_gen *cmpl;
633 
634 	if (!iwucq || LIST_EMPTY(&iwucq->cmpl_generated))
635 		return ENOENT;
636 	cmpl = LIST_FIRST(&iwucq->cmpl_generated);
637 	LIST_REMOVE(cmpl, list);
638 	memcpy(cq_poll_info, &cmpl->cpi, sizeof(*cq_poll_info));
639 
640 	free(cmpl);
641 
642 	return 0;
643 }
644 
645 /**
646  * irdma_set_cpi_common_values - fill in values for polling info struct
647  * @cpi: resulting structure of cq_poll_info type
648  * @qp: QPair
649  * @qp_num: id of the QP
650  */
651 static void
652 irdma_set_cpi_common_values(struct irdma_cq_poll_info *cpi,
653 			    struct irdma_qp_uk *qp, __u32 qp_num)
654 {
655 	cpi->comp_status = IRDMA_COMPL_STATUS_FLUSHED;
656 	cpi->error = 1;
657 	cpi->major_err = IRDMA_FLUSH_MAJOR_ERR;
658 	cpi->minor_err = FLUSH_GENERAL_ERR;
659 	cpi->qp_handle = (irdma_qp_handle) (uintptr_t)qp;
660 	cpi->qp_id = qp_num;
661 }
662 
663 static bool
664 irdma_cq_empty(struct irdma_ucq *iwucq)
665 {
666 	struct irdma_cq_uk *ukcq;
667 	__u64 qword3;
668 	__le64 *cqe;
669 	__u8 polarity;
670 
671 	ukcq = &iwucq->cq;
672 	cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq);
673 	get_64bit_val(cqe, 24, &qword3);
674 	polarity = (__u8) FIELD_GET(IRDMA_CQ_VALID, qword3);
675 
676 	return polarity != ukcq->polarity;
677 }
678 
/**
 * irdma_generate_flush_completions - generate completion from WRs
 * @iwuqp: pointer to QP
 *
 * For the send queue and then the receive queue: takes the lock of the
 * associated CQ and, if that CQ is currently empty, turns every work request
 * still outstanding on the ring into a software "flushed" completion queued
 * on the CQ's cmpl_generated list.  The ring tail is advanced past each
 * request that is converted.
 *
 * The function returns early, without touching the remaining work, when a CQ
 * lock cannot be taken or when allocating a completion fails.
 */
static void
irdma_generate_flush_completions(struct irdma_uqp *iwuqp)
{
	struct irdma_qp_uk *qp = &iwuqp->qp;
	struct irdma_ring *sq_ring = &qp->sq_ring;
	struct irdma_ring *rq_ring = &qp->rq_ring;
	struct irdma_cmpl_gen *cmpl;
	__le64 *sw_wqe;
	__u64 wqe_qword;
	__u32 wqe_idx;

	if (pthread_spin_lock(&iwuqp->send_cq->lock))
		return;
	/* only synthesize completions while the send CQ has nothing pending */
	if (irdma_cq_empty(iwuqp->send_cq)) {
		while (IRDMA_RING_MORE_WORK(*sq_ring)) {
			cmpl = malloc(sizeof(*cmpl));
			if (!cmpl) {
				pthread_spin_unlock(&iwuqp->send_cq->lock);
				return;
			}

			wqe_idx = sq_ring->tail;
			irdma_set_cpi_common_values(&cmpl->cpi, qp, qp->qp_id);
			cmpl->cpi.wr_id = qp->sq_wrtrk_array[wqe_idx].wrid;
			/* recover the opcode from the WQE that was posted */
			sw_wqe = qp->sq_base[wqe_idx].elem;
			get_64bit_val(sw_wqe, 24, &wqe_qword);
			cmpl->cpi.op_type = (__u8) FIELD_GET(IRDMAQPSQ_OPCODE, wqe_qword);
			/* remove the SQ WR by moving SQ tail */
			IRDMA_RING_SET_TAIL(*sq_ring, sq_ring->tail + qp->sq_wrtrk_array[sq_ring->tail].quanta);
			/*
			 * NOTE(review): entries are inserted at the list head and
			 * irdma_generated_cmpls() also pops from the head, so
			 * generated completions look like they are reported in
			 * reverse posting order -- confirm this is intended.
			 */
			LIST_INSERT_HEAD(&iwuqp->send_cq->cmpl_generated, cmpl, list);
		}
	}
	pthread_spin_unlock(&iwuqp->send_cq->lock);
	if (pthread_spin_lock(&iwuqp->recv_cq->lock))
		return;
	/* same treatment for the receive queue */
	if (irdma_cq_empty(iwuqp->recv_cq)) {
		while (IRDMA_RING_MORE_WORK(*rq_ring)) {
			cmpl = malloc(sizeof(*cmpl));
			if (!cmpl) {
				pthread_spin_unlock(&iwuqp->recv_cq->lock);
				return;
			}

			wqe_idx = rq_ring->tail;
			irdma_set_cpi_common_values(&cmpl->cpi, qp, qp->qp_id);
			cmpl->cpi.wr_id = qp->rq_wrid_array[wqe_idx];
			cmpl->cpi.op_type = IRDMA_OP_TYPE_REC;
			/* remove the RQ WR by moving RQ tail */
			IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1);
			LIST_INSERT_HEAD(&iwuqp->recv_cq->cmpl_generated, cmpl, list);
		}
	}
	pthread_spin_unlock(&iwuqp->recv_cq->lock);
}
737 
738 void *
739 irdma_flush_thread(void *arg)
740 {
741 	__u8 i = 5;
742 	struct irdma_uqp *iwuqp = arg;
743 
744 	while (--i) {
745 		if (pthread_spin_lock(&iwuqp->lock))
746 			break;
747 		irdma_generate_flush_completions(arg);
748 		pthread_spin_unlock(&iwuqp->lock);
749 		sleep(1);
750 	}
751 	pthread_exit(NULL);
752 }
753 
754 /**
755  * irdma_udestroy_cq - destroys cq
756  * @cq: ptr to cq to be destroyed
757  */
758 int
759 irdma_udestroy_cq(struct ibv_cq *cq)
760 {
761 	struct irdma_uk_attrs *uk_attrs;
762 	struct irdma_uvcontext *iwvctx;
763 	struct irdma_ucq *iwucq;
764 	int ret;
765 
766 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
767 	iwvctx = container_of(cq->context, struct irdma_uvcontext, ibv_ctx);
768 	uk_attrs = &iwvctx->uk_attrs;
769 
770 	ret = pthread_spin_destroy(&iwucq->lock);
771 	if (ret)
772 		goto err;
773 
774 	if (!LIST_EMPTY(&iwucq->cmpl_generated))
775 		irdma_remove_cmpls_list(iwucq);
776 	irdma_process_resize_list(iwucq, NULL);
777 	ret = ibv_cmd_destroy_cq(cq);
778 	if (ret)
779 		goto err;
780 
781 	ibv_cmd_dereg_mr(&iwucq->vmr.ibv_mr);
782 	irdma_free_hw_buf(iwucq->cq.cq_base, iwucq->buf_size);
783 
784 	if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) {
785 		ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area.ibv_mr);
786 		irdma_free_hw_buf(iwucq->cq.shadow_area, IRDMA_DB_SHADOW_AREA_SIZE);
787 	}
788 	free(iwucq);
789 	return 0;
790 
791 err:
792 	return ret;
793 }
794 
795 static enum ibv_wc_status
796 irdma_flush_err_to_ib_wc_status(enum irdma_flush_opcode opcode)
797 {
798 	switch (opcode) {
799 	case FLUSH_PROT_ERR:
800 		return IBV_WC_LOC_PROT_ERR;
801 	case FLUSH_REM_ACCESS_ERR:
802 		return IBV_WC_REM_ACCESS_ERR;
803 	case FLUSH_LOC_QP_OP_ERR:
804 		return IBV_WC_LOC_QP_OP_ERR;
805 	case FLUSH_REM_OP_ERR:
806 		return IBV_WC_REM_OP_ERR;
807 	case FLUSH_LOC_LEN_ERR:
808 		return IBV_WC_LOC_LEN_ERR;
809 	case FLUSH_GENERAL_ERR:
810 		return IBV_WC_WR_FLUSH_ERR;
811 	case FLUSH_MW_BIND_ERR:
812 		return IBV_WC_MW_BIND_ERR;
813 	case FLUSH_REM_INV_REQ_ERR:
814 		return IBV_WC_REM_INV_REQ_ERR;
815 	case FLUSH_RETRY_EXC_ERR:
816 		return IBV_WC_RETRY_EXC_ERR;
817 	case FLUSH_FATAL_ERR:
818 	default:
819 		return IBV_WC_FATAL_ERR;
820 	}
821 }
822 
823 static inline void
824 set_ib_wc_op_sq(struct irdma_cq_poll_info *cur_cqe, struct ibv_wc *entry)
825 {
826 	switch (cur_cqe->op_type) {
827 	case IRDMA_OP_TYPE_RDMA_WRITE:
828 	case IRDMA_OP_TYPE_RDMA_WRITE_SOL:
829 		entry->opcode = IBV_WC_RDMA_WRITE;
830 		break;
831 	case IRDMA_OP_TYPE_RDMA_READ:
832 		entry->opcode = IBV_WC_RDMA_READ;
833 		break;
834 	case IRDMA_OP_TYPE_SEND_SOL:
835 	case IRDMA_OP_TYPE_SEND_SOL_INV:
836 	case IRDMA_OP_TYPE_SEND_INV:
837 	case IRDMA_OP_TYPE_SEND:
838 		entry->opcode = IBV_WC_SEND;
839 		break;
840 	case IRDMA_OP_TYPE_BIND_MW:
841 		entry->opcode = IBV_WC_BIND_MW;
842 		break;
843 	case IRDMA_OP_TYPE_INV_STAG:
844 		entry->opcode = IBV_WC_LOCAL_INV;
845 		break;
846 	default:
847 		entry->status = IBV_WC_GENERAL_ERR;
848 		printf("%s: Invalid opcode = %d in CQE\n",
849 		       __func__, cur_cqe->op_type);
850 	}
851 }
852 
853 static inline void
854 set_ib_wc_op_rq(struct irdma_cq_poll_info *cur_cqe,
855 		struct ibv_wc *entry, bool send_imm_support)
856 {
857 	if (!send_imm_support) {
858 		entry->opcode = cur_cqe->imm_valid ? IBV_WC_RECV_RDMA_WITH_IMM :
859 		    IBV_WC_RECV;
860 		return;
861 	}
862 	switch (cur_cqe->op_type) {
863 	case IBV_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
864 	case IBV_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
865 		entry->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
866 		break;
867 	default:
868 		entry->opcode = IBV_WC_RECV;
869 	}
870 }
871 
872 /**
873  * irdma_process_cqe_ext - process current cqe for extended CQ
874  * @cur_cqe - current cqe info
875  */
876 static void
877 irdma_process_cqe_ext(struct irdma_cq_poll_info *cur_cqe)
878 {
879 	struct irdma_ucq *iwucq = container_of(cur_cqe, struct irdma_ucq, cur_cqe);
880 	struct ibv_cq_ex *ibvcq_ex = &iwucq->verbs_cq.cq_ex;
881 
882 	ibvcq_ex->wr_id = cur_cqe->wr_id;
883 	if (cur_cqe->error)
884 		ibvcq_ex->status = (cur_cqe->comp_status == IRDMA_COMPL_STATUS_FLUSHED) ?
885 		    irdma_flush_err_to_ib_wc_status(cur_cqe->minor_err) : IBV_WC_GENERAL_ERR;
886 	else
887 		ibvcq_ex->status = IBV_WC_SUCCESS;
888 }
889 
890 /**
891  * irdma_process_cqe - process current cqe info
892  * @entry - ibv_wc object to fill in for non-extended CQ
893  * @cur_cqe - current cqe info
894  */
895 static void
896 irdma_process_cqe(struct ibv_wc *entry, struct irdma_cq_poll_info *cur_cqe)
897 {
898 	struct irdma_qp_uk *qp;
899 	struct ibv_qp *ib_qp;
900 
901 	entry->wc_flags = 0;
902 	entry->wr_id = cur_cqe->wr_id;
903 	entry->qp_num = cur_cqe->qp_id;
904 	qp = cur_cqe->qp_handle;
905 	ib_qp = qp->back_qp;
906 
907 	if (cur_cqe->error) {
908 		entry->status = (cur_cqe->comp_status == IRDMA_COMPL_STATUS_FLUSHED) ?
909 		    irdma_flush_err_to_ib_wc_status(cur_cqe->minor_err) : IBV_WC_GENERAL_ERR;
910 		entry->vendor_err = cur_cqe->major_err << 16 |
911 		    cur_cqe->minor_err;
912 	} else {
913 		entry->status = IBV_WC_SUCCESS;
914 	}
915 
916 	if (cur_cqe->imm_valid) {
917 		entry->imm_data = htonl(cur_cqe->imm_data);
918 		entry->wc_flags |= IBV_WC_WITH_IMM;
919 	}
920 
921 	if (cur_cqe->q_type == IRDMA_CQE_QTYPE_SQ) {
922 		set_ib_wc_op_sq(cur_cqe, entry);
923 	} else {
924 		set_ib_wc_op_rq(cur_cqe, entry,
925 				qp->qp_caps & IRDMA_SEND_WITH_IMM ?
926 				true : false);
927 		if (ib_qp->qp_type != IBV_QPT_UD &&
928 		    cur_cqe->stag_invalid_set) {
929 			entry->invalidated_rkey = cur_cqe->inv_stag;
930 			entry->wc_flags |= IBV_WC_WITH_INV;
931 		}
932 	}
933 
934 	if (ib_qp->qp_type == IBV_QPT_UD) {
935 		entry->src_qp = cur_cqe->ud_src_qpn;
936 		entry->wc_flags |= IBV_WC_GRH;
937 	} else {
938 		entry->src_qp = cur_cqe->qp_id;
939 	}
940 	entry->byte_len = cur_cqe->bytes_xfered;
941 }
942 
/**
 * irdma_poll_one - poll one entry of the CQ
 * @ukcq: ukcq to poll
 * @cur_cqe: current CQE info to be filled in
 * @entry: ibv_wc object to be filled for non-extended CQ or NULL for extended CQ
 *
 * Returns the internal irdma device error code or 0 on success
 */
static int
irdma_poll_one(struct irdma_cq_uk *ukcq, struct irdma_cq_poll_info *cur_cqe,
	       struct ibv_wc *entry)
{
	int rc;

	rc = irdma_uk_cq_poll_cmpl(ukcq, cur_cqe);
	if (rc != 0)
		return rc;

	if (entry != NULL)
		irdma_process_cqe(entry, cur_cqe);
	else
		irdma_process_cqe_ext(cur_cqe);

	return 0;
}
967 
968 /**
969  * __irdma_upoll_cq - irdma util function to poll device CQ
970  * @iwucq: irdma cq to poll
971  * @num_entries: max cq entries to poll
972  * @entry: pointer to array of ibv_wc objects to be filled in for each completion or NULL if ext CQ
973  *
974  * Returns non-negative value equal to the number of completions
975  * found. On failure, EINVAL
976  */
977 static int
978 __irdma_upoll_cq(struct irdma_ucq *iwucq, int num_entries,
979 		 struct ibv_wc *entry)
980 {
981 	struct irdma_cq_buf *cq_buf, *next;
982 	struct irdma_cq_buf *last_buf = NULL;
983 	struct irdma_cq_poll_info *cur_cqe = &iwucq->cur_cqe;
984 	bool cq_new_cqe = false;
985 	int resized_bufs = 0;
986 	int npolled = 0;
987 	int ret;
988 
989 	/* go through the list of previously resized CQ buffers */
990 	LIST_FOREACH_SAFE(cq_buf, &iwucq->resize_list, list, next) {
991 		while (npolled < num_entries) {
992 			ret = irdma_poll_one(&cq_buf->cq, cur_cqe,
993 					     entry ? entry + npolled : NULL);
994 			if (!ret) {
995 				++npolled;
996 				cq_new_cqe = true;
997 				continue;
998 			}
999 			if (ret == ENOENT)
1000 				break;
1001 			/* QP using the CQ is destroyed. Skip reporting this CQE */
1002 			if (ret == EFAULT) {
1003 				cq_new_cqe = true;
1004 				continue;
1005 			}
1006 			goto error;
1007 		}
1008 
1009 		/* save the resized CQ buffer which received the last cqe */
1010 		if (cq_new_cqe)
1011 			last_buf = cq_buf;
1012 		cq_new_cqe = false;
1013 	}
1014 
1015 	/* check the current CQ for new cqes */
1016 	while (npolled < num_entries) {
1017 		ret = irdma_poll_one(&iwucq->cq, cur_cqe,
1018 				     entry ? entry + npolled : NULL);
1019 		if (ret == ENOENT) {
1020 			ret = irdma_generated_cmpls(iwucq, cur_cqe);
1021 			if (!ret) {
1022 				if (entry)
1023 					irdma_process_cqe(entry + npolled, cur_cqe);
1024 				else
1025 					irdma_process_cqe_ext(cur_cqe);
1026 			}
1027 		}
1028 		if (!ret) {
1029 			++npolled;
1030 			cq_new_cqe = true;
1031 			continue;
1032 		}
1033 		if (ret == ENOENT)
1034 			break;
1035 		/* QP using the CQ is destroyed. Skip reporting this CQE */
1036 		if (ret == EFAULT) {
1037 			cq_new_cqe = true;
1038 			continue;
1039 		}
1040 		goto error;
1041 	}
1042 
1043 	if (cq_new_cqe)
1044 		/* all previous CQ resizes are complete */
1045 		resized_bufs = irdma_process_resize_list(iwucq, NULL);
1046 	else if (last_buf)
1047 		/* only CQ resizes up to the last_buf are complete */
1048 		resized_bufs = irdma_process_resize_list(iwucq, last_buf);
1049 	if (resized_bufs)
1050 		/* report to the HW the number of complete CQ resizes */
1051 		irdma_uk_cq_set_resized_cnt(&iwucq->cq, resized_bufs);
1052 
1053 	return npolled;
1054 
1055 error:
1056 	printf("%s: Error polling CQ, irdma_err: %d\n", __func__, ret);
1057 
1058 	return EINVAL;
1059 }
1060 
1061 /**
1062  * irdma_upoll_cq - verb API callback to poll device CQ
1063  * @cq: ibv_cq to poll
1064  * @num_entries: max cq entries to poll
1065  * @entry: pointer to array of ibv_wc objects to be filled in for each completion
1066  *
1067  * Returns non-negative value equal to the number of completions
1068  * found and a negative error code on failure
1069  */
1070 int
1071 irdma_upoll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *entry)
1072 {
1073 	struct irdma_ucq *iwucq;
1074 	int ret;
1075 
1076 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1077 	ret = pthread_spin_lock(&iwucq->lock);
1078 	if (ret)
1079 		return -ret;
1080 
1081 	ret = __irdma_upoll_cq(iwucq, num_entries, entry);
1082 
1083 	pthread_spin_unlock(&iwucq->lock);
1084 
1085 	return ret;
1086 }
1087 
1088 /**
1089  * irdma_start_poll - verb_ex API callback to poll batch of WC's
1090  * @ibvcq_ex: ibv extended CQ
1091  * @attr: attributes (not used)
1092  *
1093  * Start polling batch of work completions. Return 0 on success, ENONENT when
1094  * no completions are available on CQ. And an error code on errors
1095  */
1096 static int
1097 irdma_start_poll(struct ibv_cq_ex *ibvcq_ex, struct ibv_poll_cq_attr *attr)
1098 {
1099 	struct irdma_ucq *iwucq;
1100 	int ret;
1101 
1102 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1103 	ret = pthread_spin_lock(&iwucq->lock);
1104 	if (ret)
1105 		return ret;
1106 
1107 	ret = __irdma_upoll_cq(iwucq, 1, NULL);
1108 	if (ret == 1)
1109 		return 0;
1110 
1111 	/* No Completions on CQ */
1112 	if (!ret)
1113 		ret = ENOENT;
1114 
1115 	pthread_spin_unlock(&iwucq->lock);
1116 
1117 	return ret;
1118 }
1119 
1120 /**
1121  * irdma_next_poll - verb_ex API callback to get next WC
1122  * @ibvcq_ex: ibv extended CQ
1123  *
1124  * Return 0 on success, ENONENT when no completions are available on CQ.
1125  * And an error code on errors
1126  */
1127 static int
1128 irdma_next_poll(struct ibv_cq_ex *ibvcq_ex)
1129 {
1130 	struct irdma_ucq *iwucq;
1131 	int ret;
1132 
1133 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1134 	ret = __irdma_upoll_cq(iwucq, 1, NULL);
1135 	if (ret == 1)
1136 		return 0;
1137 
1138 	/* No Completions on CQ */
1139 	if (!ret)
1140 		ret = ENOENT;
1141 
1142 	return ret;
1143 }
1144 
1145 /**
1146  * irdma_end_poll - verb_ex API callback to end polling of WC's
1147  * @ibvcq_ex: ibv extended CQ
1148  */
1149 static void
1150 irdma_end_poll(struct ibv_cq_ex *ibvcq_ex)
1151 {
1152 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1153 					       verbs_cq.cq_ex);
1154 
1155 	pthread_spin_unlock(&iwucq->lock);
1156 }
1157 
1158 static enum ibv_wc_opcode
1159 irdma_wc_read_opcode(struct ibv_cq_ex *ibvcq_ex)
1160 {
1161 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1162 					       verbs_cq.cq_ex);
1163 
1164 	switch (iwucq->cur_cqe.op_type) {
1165 	case IRDMA_OP_TYPE_RDMA_WRITE:
1166 	case IRDMA_OP_TYPE_RDMA_WRITE_SOL:
1167 		return IBV_WC_RDMA_WRITE;
1168 	case IRDMA_OP_TYPE_RDMA_READ:
1169 		return IBV_WC_RDMA_READ;
1170 	case IRDMA_OP_TYPE_SEND_SOL:
1171 	case IRDMA_OP_TYPE_SEND_SOL_INV:
1172 	case IRDMA_OP_TYPE_SEND_INV:
1173 	case IRDMA_OP_TYPE_SEND:
1174 		return IBV_WC_SEND;
1175 	case IRDMA_OP_TYPE_BIND_MW:
1176 		return IBV_WC_BIND_MW;
1177 	case IRDMA_OP_TYPE_REC:
1178 		return IBV_WC_RECV;
1179 	case IRDMA_OP_TYPE_REC_IMM:
1180 		return IBV_WC_RECV_RDMA_WITH_IMM;
1181 	case IRDMA_OP_TYPE_INV_STAG:
1182 		return IBV_WC_LOCAL_INV;
1183 	}
1184 
1185 	printf("%s: Invalid opcode = %d in CQE\n", __func__,
1186 	       iwucq->cur_cqe.op_type);
1187 
1188 	return 0;
1189 }
1190 
1191 static uint32_t irdma_wc_read_vendor_err(struct ibv_cq_ex *ibvcq_ex){
1192 	struct irdma_cq_poll_info *cur_cqe;
1193 	struct irdma_ucq *iwucq;
1194 
1195 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1196 	cur_cqe = &iwucq->cur_cqe;
1197 
1198 	return cur_cqe->error ? cur_cqe->major_err << 16 | cur_cqe->minor_err : 0;
1199 }
1200 
1201 static int
1202 irdma_wc_read_wc_flags(struct ibv_cq_ex *ibvcq_ex)
1203 {
1204 	struct irdma_cq_poll_info *cur_cqe;
1205 	struct irdma_ucq *iwucq;
1206 	struct irdma_qp_uk *qp;
1207 	struct ibv_qp *ib_qp;
1208 	int wc_flags = 0;
1209 
1210 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1211 	cur_cqe = &iwucq->cur_cqe;
1212 	qp = cur_cqe->qp_handle;
1213 	ib_qp = qp->back_qp;
1214 
1215 	if (cur_cqe->imm_valid)
1216 		wc_flags |= IBV_WC_WITH_IMM;
1217 
1218 	if (ib_qp->qp_type == IBV_QPT_UD) {
1219 		wc_flags |= IBV_WC_GRH;
1220 	} else {
1221 		if (cur_cqe->stag_invalid_set) {
1222 			switch (cur_cqe->op_type) {
1223 			case IRDMA_OP_TYPE_REC:
1224 				wc_flags |= IBV_WC_WITH_INV;
1225 				break;
1226 			case IRDMA_OP_TYPE_REC_IMM:
1227 				wc_flags |= IBV_WC_WITH_INV;
1228 				break;
1229 			}
1230 		}
1231 	}
1232 
1233 	return wc_flags;
1234 }
1235 
1236 static uint32_t irdma_wc_read_byte_len(struct ibv_cq_ex *ibvcq_ex){
1237 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1238 					       verbs_cq.cq_ex);
1239 
1240 	return iwucq->cur_cqe.bytes_xfered;
1241 }
1242 
1243 static __be32 irdma_wc_read_imm_data(struct ibv_cq_ex *ibvcq_ex){
1244 	struct irdma_cq_poll_info *cur_cqe;
1245 	struct irdma_ucq *iwucq;
1246 
1247 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1248 	cur_cqe = &iwucq->cur_cqe;
1249 
1250 	return cur_cqe->imm_valid ? htonl(cur_cqe->imm_data) : 0;
1251 }
1252 
1253 static uint32_t irdma_wc_read_qp_num(struct ibv_cq_ex *ibvcq_ex){
1254 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1255 					       verbs_cq.cq_ex);
1256 
1257 	return iwucq->cur_cqe.qp_id;
1258 }
1259 
1260 static uint32_t irdma_wc_read_src_qp(struct ibv_cq_ex *ibvcq_ex){
1261 	struct irdma_cq_poll_info *cur_cqe;
1262 	struct irdma_ucq *iwucq;
1263 	struct irdma_qp_uk *qp;
1264 	struct ibv_qp *ib_qp;
1265 
1266 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1267 	cur_cqe = &iwucq->cur_cqe;
1268 	qp = cur_cqe->qp_handle;
1269 	ib_qp = qp->back_qp;
1270 
1271 	return ib_qp->qp_type == IBV_QPT_UD ? cur_cqe->ud_src_qpn : cur_cqe->qp_id;
1272 }
1273 
/**
 * irdma_wc_read_sl - verb_ex API callback to read the service level
 * @ibvcq_ex: ibv extended CQ (not used)
 *
 * Always reports 0.
 */
static uint8_t
irdma_wc_read_sl(struct ibv_cq_ex *ibvcq_ex)
{
	(void)ibvcq_ex;

	return 0;
}
1277 
1278 void
1279 irdma_ibvcq_ex_fill_priv_funcs(struct irdma_ucq *iwucq,
1280 			       struct ibv_cq_init_attr_ex *attr_ex)
1281 {
1282 	struct ibv_cq_ex *ibvcq_ex = &iwucq->verbs_cq.cq_ex;
1283 
1284 	ibvcq_ex->start_poll = irdma_start_poll;
1285 	ibvcq_ex->end_poll = irdma_end_poll;
1286 	ibvcq_ex->next_poll = irdma_next_poll;
1287 
1288 	ibvcq_ex->read_opcode = irdma_wc_read_opcode;
1289 	ibvcq_ex->read_vendor_err = irdma_wc_read_vendor_err;
1290 	ibvcq_ex->read_wc_flags = irdma_wc_read_wc_flags;
1291 
1292 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
1293 		ibvcq_ex->read_byte_len = irdma_wc_read_byte_len;
1294 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_IMM)
1295 		ibvcq_ex->read_imm_data = irdma_wc_read_imm_data;
1296 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_QP_NUM)
1297 		ibvcq_ex->read_qp_num = irdma_wc_read_qp_num;
1298 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_SRC_QP)
1299 		ibvcq_ex->read_src_qp = irdma_wc_read_src_qp;
1300 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_SL)
1301 		ibvcq_ex->read_sl = irdma_wc_read_sl;
1302 }
1303 
1304 /**
1305  * irdma_arm_cq - arm of cq
1306  * @iwucq: cq to which arm
1307  * @cq_notify: notification params
1308  */
1309 static void
1310 irdma_arm_cq(struct irdma_ucq *iwucq,
1311 	     enum irdma_cmpl_notify cq_notify)
1312 {
1313 	iwucq->is_armed = true;
1314 	iwucq->arm_sol = true;
1315 	iwucq->skip_arm = false;
1316 	iwucq->skip_sol = true;
1317 	irdma_uk_cq_request_notification(&iwucq->cq, cq_notify);
1318 }
1319 
1320 /**
1321  * irdma_uarm_cq - callback for arm of cq
1322  * @cq: cq to arm
1323  * @solicited: to get notify params
1324  */
1325 int
1326 irdma_uarm_cq(struct ibv_cq *cq, int solicited)
1327 {
1328 	struct irdma_ucq *iwucq;
1329 	enum irdma_cmpl_notify cq_notify = IRDMA_CQ_COMPL_EVENT;
1330 	int ret;
1331 
1332 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1333 	if (solicited)
1334 		cq_notify = IRDMA_CQ_COMPL_SOLICITED;
1335 
1336 	ret = pthread_spin_lock(&iwucq->lock);
1337 	if (ret)
1338 		return ret;
1339 
1340 	if (iwucq->is_armed) {
1341 		if (iwucq->arm_sol && !solicited) {
1342 			irdma_arm_cq(iwucq, cq_notify);
1343 		} else {
1344 			iwucq->skip_arm = true;
1345 			iwucq->skip_sol = solicited ? true : false;
1346 		}
1347 	} else {
1348 		irdma_arm_cq(iwucq, cq_notify);
1349 	}
1350 
1351 	pthread_spin_unlock(&iwucq->lock);
1352 
1353 	return 0;
1354 }
1355 
1356 /**
1357  * irdma_cq_event - cq to do completion event
1358  * @cq: cq to arm
1359  */
1360 void
1361 irdma_cq_event(struct ibv_cq *cq)
1362 {
1363 	struct irdma_ucq *iwucq;
1364 
1365 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1366 	if (pthread_spin_lock(&iwucq->lock))
1367 		return;
1368 
1369 	if (iwucq->skip_arm)
1370 		irdma_arm_cq(iwucq, IRDMA_CQ_COMPL_EVENT);
1371 	else
1372 		iwucq->is_armed = false;
1373 
1374 	pthread_spin_unlock(&iwucq->lock);
1375 }
1376 
1377 void *
1378 irdma_mmap(int fd, off_t offset)
1379 {
1380 	void *map;
1381 
1382 	map = mmap(NULL, IRDMA_HW_PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED,
1383 		   fd, offset);
1384 	if (map == MAP_FAILED)
1385 		return map;
1386 
1387 	if (ibv_dontfork_range(map, IRDMA_HW_PAGE_SIZE)) {
1388 		munmap(map, IRDMA_HW_PAGE_SIZE);
1389 		return MAP_FAILED;
1390 	}
1391 
1392 	return map;
1393 }
1394 
1395 void
1396 irdma_munmap(void *map)
1397 {
1398 	ibv_dofork_range(map, IRDMA_HW_PAGE_SIZE);
1399 	munmap(map, IRDMA_HW_PAGE_SIZE);
1400 }
1401 
1402 /**
1403  * irdma_destroy_vmapped_qp - destroy resources for qp
1404  * @iwuqp: qp struct for resources
1405  */
1406 static int
1407 irdma_destroy_vmapped_qp(struct irdma_uqp *iwuqp)
1408 {
1409 	int ret;
1410 
1411 	ret = ibv_cmd_destroy_qp(&iwuqp->ibv_qp);
1412 	if (ret)
1413 		return ret;
1414 
1415 	if (iwuqp->qp.push_db)
1416 		irdma_munmap(iwuqp->qp.push_db);
1417 	if (iwuqp->qp.push_wqe)
1418 		irdma_munmap(iwuqp->qp.push_wqe);
1419 
1420 	ibv_cmd_dereg_mr(&iwuqp->vmr.ibv_mr);
1421 
1422 	return 0;
1423 }
1424 
1425 /**
1426  * irdma_vmapped_qp - create resources for qp
1427  * @iwuqp: qp struct for resources
1428  * @pd: pd for the qp
1429  * @attr: attributes of qp passed
1430  * @resp: response back from create qp
1431  * @info: uk info for initializing user level qp
1432  * @abi_ver: abi version of the create qp command
1433  */
1434 static int
1435 irdma_vmapped_qp(struct irdma_uqp *iwuqp, struct ibv_pd *pd,
1436 		 struct ibv_qp_init_attr *attr,
1437 		 struct irdma_qp_uk_init_info *info,
1438 		 bool legacy_mode)
1439 {
1440 	struct irdma_ucreate_qp cmd = {};
1441 	size_t sqsize, rqsize, totalqpsize;
1442 	struct irdma_ucreate_qp_resp resp = {};
1443 	struct irdma_ureg_mr reg_mr_cmd = {};
1444 	struct ibv_reg_mr_resp reg_mr_resp = {};
1445 	int ret;
1446 
1447 	sqsize = roundup(info->sq_depth * IRDMA_QP_WQE_MIN_SIZE, IRDMA_HW_PAGE_SIZE);
1448 	rqsize = roundup(info->rq_depth * IRDMA_QP_WQE_MIN_SIZE, IRDMA_HW_PAGE_SIZE);
1449 	totalqpsize = rqsize + sqsize + IRDMA_DB_SHADOW_AREA_SIZE;
1450 	info->sq = irdma_alloc_hw_buf(totalqpsize);
1451 	iwuqp->buf_size = totalqpsize;
1452 
1453 	if (!info->sq)
1454 		return ENOMEM;
1455 
1456 	memset(info->sq, 0, totalqpsize);
1457 	info->rq = &info->sq[sqsize / IRDMA_QP_WQE_MIN_SIZE];
1458 	info->shadow_area = info->rq[rqsize / IRDMA_QP_WQE_MIN_SIZE].elem;
1459 
1460 	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_QP;
1461 	reg_mr_cmd.sq_pages = sqsize >> IRDMA_HW_PAGE_SHIFT;
1462 	reg_mr_cmd.rq_pages = rqsize >> IRDMA_HW_PAGE_SHIFT;
1463 
1464 	ret = ibv_cmd_reg_mr(pd, info->sq, totalqpsize,
1465 			     (uintptr_t)info->sq, IBV_ACCESS_LOCAL_WRITE,
1466 			     &iwuqp->vmr.ibv_mr, &reg_mr_cmd.ibv_cmd,
1467 			     sizeof(reg_mr_cmd), &reg_mr_resp,
1468 			     sizeof(reg_mr_resp));
1469 	if (ret)
1470 		goto err_dereg_mr;
1471 
1472 	cmd.user_wqe_bufs = (__u64) ((uintptr_t)info->sq);
1473 	cmd.user_compl_ctx = (__u64) (uintptr_t)&iwuqp->qp;
1474 	ret = ibv_cmd_create_qp(pd, &iwuqp->ibv_qp, attr, &cmd.ibv_cmd,
1475 				sizeof(cmd), &resp.ibv_resp,
1476 				sizeof(struct irdma_ucreate_qp_resp));
1477 	if (ret)
1478 		goto err_qp;
1479 
1480 	info->sq_size = resp.actual_sq_size;
1481 	info->rq_size = resp.actual_rq_size;
1482 	info->first_sq_wq = legacy_mode ? 1 : resp.lsmm;
1483 	info->qp_caps = resp.qp_caps;
1484 	info->qp_id = resp.qp_id;
1485 	iwuqp->irdma_drv_opt = resp.irdma_drv_opt;
1486 	iwuqp->ibv_qp.qp_num = resp.qp_id;
1487 
1488 	iwuqp->send_cq = container_of(attr->send_cq, struct irdma_ucq,
1489 				      verbs_cq.cq);
1490 	iwuqp->recv_cq = container_of(attr->recv_cq, struct irdma_ucq,
1491 				      verbs_cq.cq);
1492 	iwuqp->send_cq->uqp = iwuqp;
1493 	iwuqp->recv_cq->uqp = iwuqp;
1494 
1495 	return 0;
1496 err_qp:
1497 	ibv_cmd_dereg_mr(&iwuqp->vmr.ibv_mr);
1498 err_dereg_mr:
1499 	printf("%s: failed to create QP, status %d\n", __func__, ret);
1500 	irdma_free_hw_buf(info->sq, iwuqp->buf_size);
1501 	return ret;
1502 }
1503 
1504 /**
1505  * irdma_ucreate_qp - create qp on user app
1506  * @pd: pd for the qp
1507  * @attr: attributes of the qp to be created (sizes, sge, cq)
1508  */
1509 struct ibv_qp *
1510 irdma_ucreate_qp(struct ibv_pd *pd,
1511 		 struct ibv_qp_init_attr *attr)
1512 {
1513 	struct irdma_qp_uk_init_info info = {};
1514 	struct irdma_uk_attrs *uk_attrs;
1515 	struct irdma_uvcontext *iwvctx;
1516 	struct irdma_uqp *iwuqp;
1517 	int status;
1518 
1519 	if (attr->qp_type != IBV_QPT_RC && attr->qp_type != IBV_QPT_UD) {
1520 		printf("%s: failed to create QP, unsupported QP type: 0x%x\n",
1521 		       __func__, attr->qp_type);
1522 		errno = EOPNOTSUPP;
1523 		return NULL;
1524 	}
1525 
1526 	iwvctx = container_of(pd->context, struct irdma_uvcontext, ibv_ctx);
1527 	uk_attrs = &iwvctx->uk_attrs;
1528 
1529 	if (attr->cap.max_send_sge > uk_attrs->max_hw_wq_frags ||
1530 	    attr->cap.max_recv_sge > uk_attrs->max_hw_wq_frags ||
1531 	    attr->cap.max_inline_data > uk_attrs->max_hw_inline) {
1532 		errno = EINVAL;
1533 		return NULL;
1534 	}
1535 
1536 	info.uk_attrs = uk_attrs;
1537 	info.sq_size = attr->cap.max_send_wr;
1538 	info.rq_size = attr->cap.max_recv_wr;
1539 	info.max_sq_frag_cnt = attr->cap.max_send_sge;
1540 	info.max_rq_frag_cnt = attr->cap.max_recv_sge;
1541 	info.max_inline_data = attr->cap.max_inline_data;
1542 	info.abi_ver = iwvctx->abi_ver;
1543 
1544 	status = irdma_uk_calc_depth_shift_sq(&info, &info.sq_depth, &info.sq_shift);
1545 	if (status) {
1546 		printf("%s: invalid SQ attributes, max_send_wr=%d max_send_sge=%d max_inline=%d\n",
1547 		       __func__, attr->cap.max_send_wr, attr->cap.max_send_sge,
1548 		       attr->cap.max_inline_data);
1549 		errno = status;
1550 		return NULL;
1551 	}
1552 
1553 	status = irdma_uk_calc_depth_shift_rq(&info, &info.rq_depth, &info.rq_shift);
1554 	if (status) {
1555 		printf("%s: invalid RQ attributes, recv_wr=%d recv_sge=%d\n",
1556 		       __func__, attr->cap.max_recv_wr, attr->cap.max_recv_sge);
1557 		errno = status;
1558 		return NULL;
1559 	}
1560 
1561 	iwuqp = memalign(1024, sizeof(*iwuqp));
1562 	if (!iwuqp)
1563 		return NULL;
1564 
1565 	memset(iwuqp, 0, sizeof(*iwuqp));
1566 
1567 	status = pthread_spin_init(&iwuqp->lock, PTHREAD_PROCESS_PRIVATE);
1568 	if (status)
1569 		goto err_free_qp;
1570 
1571 	info.sq_size = info.sq_depth >> info.sq_shift;
1572 	info.rq_size = info.rq_depth >> info.rq_shift;
1573 	/**
1574 	 * Maintain backward compatibility with older ABI which pass sq
1575 	 * and rq depth (in quanta) in cap.max_send_wr a cap.max_recv_wr
1576 	 */
1577 	if (!iwvctx->use_raw_attrs) {
1578 		attr->cap.max_send_wr = info.sq_size;
1579 		attr->cap.max_recv_wr = info.rq_size;
1580 	}
1581 
1582 	iwuqp->recv_sges = calloc(attr->cap.max_recv_sge, sizeof(*iwuqp->recv_sges));
1583 	if (!iwuqp->recv_sges) {
1584 		status = errno;	/* preserve errno */
1585 		goto err_destroy_lock;
1586 	}
1587 
1588 	info.wqe_alloc_db = (u32 *)iwvctx->db;
1589 	info.legacy_mode = iwvctx->legacy_mode;
1590 	info.sq_wrtrk_array = calloc(info.sq_depth, sizeof(*info.sq_wrtrk_array));
1591 	if (!info.sq_wrtrk_array) {
1592 		status = errno;	/* preserve errno */
1593 		goto err_free_rsges;
1594 	}
1595 
1596 	info.rq_wrid_array = calloc(info.rq_depth, sizeof(*info.rq_wrid_array));
1597 	if (!info.rq_wrid_array) {
1598 		status = errno;	/* preserve errno */
1599 		goto err_free_sq_wrtrk;
1600 	}
1601 
1602 	iwuqp->sq_sig_all = attr->sq_sig_all;
1603 	iwuqp->qp_type = attr->qp_type;
1604 	status = irdma_vmapped_qp(iwuqp, pd, attr, &info, iwvctx->legacy_mode);
1605 	if (status)
1606 		goto err_free_rq_wrid;
1607 
1608 	iwuqp->qp.back_qp = iwuqp;
1609 	iwuqp->qp.lock = &iwuqp->lock;
1610 
1611 	status = irdma_uk_qp_init(&iwuqp->qp, &info);
1612 	if (status)
1613 		goto err_free_vmap_qp;
1614 
1615 	attr->cap.max_send_wr = (info.sq_depth - IRDMA_SQ_RSVD) >> info.sq_shift;
1616 	attr->cap.max_recv_wr = (info.rq_depth - IRDMA_RQ_RSVD) >> info.rq_shift;
1617 
1618 	return &iwuqp->ibv_qp;
1619 
1620 err_free_vmap_qp:
1621 	irdma_destroy_vmapped_qp(iwuqp);
1622 	irdma_free_hw_buf(info.sq, iwuqp->buf_size);
1623 err_free_rq_wrid:
1624 	free(info.rq_wrid_array);
1625 err_free_sq_wrtrk:
1626 	free(info.sq_wrtrk_array);
1627 err_free_rsges:
1628 	free(iwuqp->recv_sges);
1629 err_destroy_lock:
1630 	pthread_spin_destroy(&iwuqp->lock);
1631 err_free_qp:
1632 	printf("%s: failed to create QP\n", __func__);
1633 	free(iwuqp);
1634 
1635 	errno = status;
1636 	return NULL;
1637 }
1638 
1639 /**
1640  * irdma_uquery_qp - query qp for some attribute
1641  * @qp: qp for the attributes query
1642  * @attr: to return the attributes
1643  * @attr_mask: mask of what is query for
1644  * @init_attr: initial attributes during create_qp
1645  */
1646 int
1647 irdma_uquery_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask,
1648 		struct ibv_qp_init_attr *init_attr)
1649 {
1650 	struct ibv_query_qp cmd;
1651 
1652 	return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd,
1653 				sizeof(cmd));
1654 }
1655 
1656 /**
1657  * irdma_umodify_qp - send qp modify to driver
1658  * @qp: qp to modify
1659  * @attr: attribute to modify
1660  * @attr_mask: mask of the attribute
1661  */
1662 int
1663 irdma_umodify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask)
1664 {
1665 	struct irdma_umodify_qp_resp resp = {};
1666 	struct ibv_modify_qp cmd = {};
1667 	struct irdma_modify_qp_cmd cmd_ex = {};
1668 	struct irdma_uvcontext *iwvctx;
1669 	struct irdma_uqp *iwuqp;
1670 
1671 	iwuqp = container_of(qp, struct irdma_uqp, ibv_qp);
1672 	iwvctx = container_of(qp->context, struct irdma_uvcontext, ibv_ctx);
1673 
1674 	if (iwuqp->qp.qp_caps & IRDMA_PUSH_MODE && attr_mask & IBV_QP_STATE &&
1675 	    iwvctx->uk_attrs.hw_rev > IRDMA_GEN_1) {
1676 		u64 offset;
1677 		void *map;
1678 		int ret;
1679 
1680 		ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex.ibv_cmd,
1681 					   sizeof(cmd_ex.ibv_cmd),
1682 					   sizeof(cmd_ex), &resp.ibv_resp,
1683 					   sizeof(resp.ibv_resp),
1684 					   sizeof(resp));
1685 		if (!ret)
1686 			iwuqp->qp.rd_fence_rate = resp.rd_fence_rate;
1687 		if (ret || !resp.push_valid)
1688 			return ret;
1689 
1690 		if (iwuqp->qp.push_wqe)
1691 			return ret;
1692 
1693 		offset = resp.push_wqe_mmap_key;
1694 		map = irdma_mmap(qp->context->cmd_fd, offset);
1695 		if (map == MAP_FAILED)
1696 			return ret;
1697 
1698 		iwuqp->qp.push_wqe = map;
1699 
1700 		offset = resp.push_db_mmap_key;
1701 		map = irdma_mmap(qp->context->cmd_fd, offset);
1702 		if (map == MAP_FAILED) {
1703 			irdma_munmap(iwuqp->qp.push_wqe);
1704 			iwuqp->qp.push_wqe = NULL;
1705 			printf("failed to map push page, errno %d\n", errno);
1706 			return ret;
1707 		}
1708 		iwuqp->qp.push_wqe += resp.push_offset;
1709 		iwuqp->qp.push_db = map + resp.push_offset;
1710 
1711 		return ret;
1712 	} else {
1713 		int ret;
1714 
1715 		ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd));
1716 		if (ret)
1717 			return ret;
1718 		if (attr_mask & IBV_QP_STATE && attr->qp_state == IBV_QPS_ERR)
1719 			pthread_create(&iwuqp->flush_thread, NULL, irdma_flush_thread, iwuqp);
1720 		return 0;
1721 	}
1722 }
1723 
1724 static void
1725 irdma_issue_flush(struct ibv_qp *qp, bool sq_flush, bool rq_flush)
1726 {
1727 	struct irdma_umodify_qp_resp resp = {};
1728 	struct irdma_modify_qp_cmd cmd_ex = {};
1729 	struct ibv_qp_attr attr = {};
1730 
1731 	attr.qp_state = IBV_QPS_ERR;
1732 	cmd_ex.sq_flush = sq_flush;
1733 	cmd_ex.rq_flush = rq_flush;
1734 
1735 	ibv_cmd_modify_qp_ex(qp, &attr, IBV_QP_STATE,
1736 			     &cmd_ex.ibv_cmd,
1737 			     sizeof(cmd_ex.ibv_cmd),
1738 			     sizeof(cmd_ex), &resp.ibv_resp,
1739 			     sizeof(resp.ibv_resp),
1740 			     sizeof(resp));
1741 }
1742 
1743 /**
1744  * irdma_clean_cqes - clean cq entries for qp
1745  * @qp: qp for which completions are cleaned
1746  * @iwcq: cq to be cleaned
1747  */
1748 static void
1749 irdma_clean_cqes(struct irdma_qp_uk *qp, struct irdma_ucq *iwucq)
1750 {
1751 	struct irdma_cq_uk *ukcq = &iwucq->cq;
1752 	int ret;
1753 
1754 	ret = pthread_spin_lock(&iwucq->lock);
1755 	if (ret)
1756 		return;
1757 
1758 	irdma_uk_clean_cq(qp, ukcq);
1759 	pthread_spin_unlock(&iwucq->lock);
1760 }
1761 
1762 /**
1763  * irdma_udestroy_qp - destroy qp
1764  * @qp: qp to destroy
1765  */
1766 int
1767 irdma_udestroy_qp(struct ibv_qp *qp)
1768 {
1769 	struct irdma_uqp *iwuqp;
1770 	int ret;
1771 
1772 	iwuqp = container_of(qp, struct irdma_uqp, ibv_qp);
1773 	if (iwuqp->flush_thread) {
1774 		pthread_cancel(iwuqp->flush_thread);
1775 		pthread_join(iwuqp->flush_thread, NULL);
1776 	}
1777 	ret = pthread_spin_destroy(&iwuqp->lock);
1778 	if (ret)
1779 		goto err;
1780 
1781 	ret = irdma_destroy_vmapped_qp(iwuqp);
1782 	if (ret)
1783 		goto err;
1784 
1785 	/* Clean any pending completions from the cq(s) */
1786 	if (iwuqp->send_cq)
1787 		irdma_clean_cqes(&iwuqp->qp, iwuqp->send_cq);
1788 
1789 	if (iwuqp->recv_cq && iwuqp->recv_cq != iwuqp->send_cq)
1790 		irdma_clean_cqes(&iwuqp->qp, iwuqp->recv_cq);
1791 
1792 	if (iwuqp->qp.sq_wrtrk_array)
1793 		free(iwuqp->qp.sq_wrtrk_array);
1794 	if (iwuqp->qp.rq_wrid_array)
1795 		free(iwuqp->qp.rq_wrid_array);
1796 
1797 	irdma_free_hw_buf(iwuqp->qp.sq_base, iwuqp->buf_size);
1798 	free(iwuqp->recv_sges);
1799 	free(iwuqp);
1800 	return 0;
1801 
1802 err:
1803 	printf("%s: failed to destroy QP, status %d\n",
1804 	       __func__, ret);
1805 	return ret;
1806 }
1807 
1808 /**
1809  * irdma_copy_sg_list - copy sg list for qp
1810  * @sg_list: copied into sg_list
1811  * @sgl: copy from sgl
1812  * @num_sges: count of sg entries
1813  * @max_sges: count of max supported sg entries
1814  */
1815 static void
1816 irdma_copy_sg_list(struct irdma_sge *sg_list, struct ibv_sge *sgl,
1817 		   int num_sges)
1818 {
1819 	int i;
1820 
1821 	for (i = 0; i < num_sges; i++) {
1822 		sg_list[i].tag_off = sgl[i].addr;
1823 		sg_list[i].len = sgl[i].length;
1824 		sg_list[i].stag = sgl[i].lkey;
1825 	}
1826 }
1827 
1828 /**
1829  * calc_type2_mw_stag - calculate type 2 MW stag
1830  * @rkey: desired rkey of the MW
1831  * @mw_rkey: type2 memory window rkey
1832  *
1833  * compute type2 memory window stag by taking lower 8 bits
1834  * of the desired rkey and leaving 24 bits if mw->rkey unchanged
1835  */
1836 static inline u32 calc_type2_mw_stag(u32 rkey, u32 mw_rkey) {
1837 	const u32 mask = 0xff;
1838 
1839 	return (rkey & mask) | (mw_rkey & ~mask);
1840 }
1841 
1842 /**
1843  * irdma_post_send -  post send wr for user application
1844  * @ib_qp: qp to post wr
1845  * @ib_wr: work request ptr
1846  * @bad_wr: return of bad wr if err
1847  */
1848 int
1849 irdma_upost_send(struct ibv_qp *ib_qp, struct ibv_send_wr *ib_wr,
1850 		 struct ibv_send_wr **bad_wr)
1851 {
1852 	struct irdma_post_sq_info info;
1853 	struct irdma_uvcontext *iwvctx;
1854 	struct irdma_uk_attrs *uk_attrs;
1855 	struct irdma_uqp *iwuqp;
1856 	bool reflush = false;
1857 	int err = 0;
1858 
1859 	iwuqp = container_of(ib_qp, struct irdma_uqp, ibv_qp);
1860 	iwvctx = container_of(ib_qp->context, struct irdma_uvcontext, ibv_ctx);
1861 	uk_attrs = &iwvctx->uk_attrs;
1862 
1863 	err = pthread_spin_lock(&iwuqp->lock);
1864 	if (err)
1865 		return err;
1866 
1867 	if (!IRDMA_RING_MORE_WORK(iwuqp->qp.sq_ring) &&
1868 	    ib_qp->state == IBV_QPS_ERR)
1869 		reflush = true;
1870 
1871 	while (ib_wr) {
1872 		memset(&info, 0, sizeof(info));
1873 		info.wr_id = (u64)(ib_wr->wr_id);
1874 		if ((ib_wr->send_flags & IBV_SEND_SIGNALED) ||
1875 		    iwuqp->sq_sig_all)
1876 			info.signaled = true;
1877 		if (ib_wr->send_flags & IBV_SEND_FENCE)
1878 			info.read_fence = true;
1879 
1880 		switch (ib_wr->opcode) {
1881 		case IBV_WR_SEND_WITH_IMM:
1882 			if (iwuqp->qp.qp_caps & IRDMA_SEND_WITH_IMM) {
1883 				info.imm_data_valid = true;
1884 				info.imm_data = ntohl(ib_wr->imm_data);
1885 			} else {
1886 				err = EINVAL;
1887 				break;
1888 			}
1889 			/* fallthrough */
1890 		case IBV_WR_SEND:
1891 		case IBV_WR_SEND_WITH_INV:
1892 			if (ib_wr->opcode == IBV_WR_SEND ||
1893 			    ib_wr->opcode == IBV_WR_SEND_WITH_IMM) {
1894 				if (ib_wr->send_flags & IBV_SEND_SOLICITED)
1895 					info.op_type = IRDMA_OP_TYPE_SEND_SOL;
1896 				else
1897 					info.op_type = IRDMA_OP_TYPE_SEND;
1898 			} else {
1899 				if (ib_wr->send_flags & IBV_SEND_SOLICITED)
1900 					info.op_type = IRDMA_OP_TYPE_SEND_SOL_INV;
1901 				else
1902 					info.op_type = IRDMA_OP_TYPE_SEND_INV;
1903 				info.stag_to_inv = ib_wr->imm_data;
1904 			}
1905 			info.op.send.num_sges = ib_wr->num_sge;
1906 			info.op.send.sg_list = (struct irdma_sge *)ib_wr->sg_list;
1907 			if (ib_qp->qp_type == IBV_QPT_UD) {
1908 				struct irdma_uah *ah = container_of(ib_wr->wr.ud.ah,
1909 								    struct irdma_uah, ibv_ah);
1910 
1911 				info.op.send.ah_id = ah->ah_id;
1912 				info.op.send.qkey = ib_wr->wr.ud.remote_qkey;
1913 				info.op.send.dest_qp = ib_wr->wr.ud.remote_qpn;
1914 			}
1915 
1916 			if (ib_wr->send_flags & IBV_SEND_INLINE)
1917 				err = irdma_uk_inline_send(&iwuqp->qp, &info, false);
1918 			else
1919 				err = irdma_uk_send(&iwuqp->qp, &info, false);
1920 			break;
1921 		case IBV_WR_RDMA_WRITE_WITH_IMM:
1922 			if (iwuqp->qp.qp_caps & IRDMA_WRITE_WITH_IMM) {
1923 				info.imm_data_valid = true;
1924 				info.imm_data = ntohl(ib_wr->imm_data);
1925 			} else {
1926 				err = EINVAL;
1927 				break;
1928 			}
1929 			/* fallthrough */
1930 		case IBV_WR_RDMA_WRITE:
1931 			if (ib_wr->send_flags & IBV_SEND_SOLICITED)
1932 				info.op_type = IRDMA_OP_TYPE_RDMA_WRITE_SOL;
1933 			else
1934 				info.op_type = IRDMA_OP_TYPE_RDMA_WRITE;
1935 
1936 			info.op.rdma_write.num_lo_sges = ib_wr->num_sge;
1937 			info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list;
1938 			info.op.rdma_write.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr;
1939 			info.op.rdma_write.rem_addr.stag = ib_wr->wr.rdma.rkey;
1940 			if (ib_wr->send_flags & IBV_SEND_INLINE)
1941 				err = irdma_uk_inline_rdma_write(&iwuqp->qp, &info, false);
1942 			else
1943 				err = irdma_uk_rdma_write(&iwuqp->qp, &info, false);
1944 			break;
1945 		case IBV_WR_RDMA_READ:
1946 			if (ib_wr->num_sge > uk_attrs->max_hw_read_sges) {
1947 				err = EINVAL;
1948 				break;
1949 			}
1950 			info.op_type = IRDMA_OP_TYPE_RDMA_READ;
1951 			info.op.rdma_read.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr;
1952 			info.op.rdma_read.rem_addr.stag = ib_wr->wr.rdma.rkey;
1953 
1954 			info.op.rdma_read.lo_sg_list = (void *)ib_wr->sg_list;
1955 			info.op.rdma_read.num_lo_sges = ib_wr->num_sge;
1956 			err = irdma_uk_rdma_read(&iwuqp->qp, &info, false, false);
1957 			break;
1958 		case IBV_WR_BIND_MW:
1959 			if (ib_qp->qp_type != IBV_QPT_RC) {
1960 				err = EINVAL;
1961 				break;
1962 			}
1963 			info.op_type = IRDMA_OP_TYPE_BIND_MW;
1964 			info.op.bind_window.mr_stag = ib_wr->bind_mw.bind_info.mr->rkey;
1965 			if (ib_wr->bind_mw.mw->type == IBV_MW_TYPE_1) {
1966 				info.op.bind_window.mem_window_type_1 = true;
1967 				info.op.bind_window.mw_stag = ib_wr->bind_mw.rkey;
1968 			} else {
1969 				struct verbs_mr *vmr = verbs_get_mr(ib_wr->bind_mw.bind_info.mr);
1970 
1971 				if (vmr->access & IBV_ACCESS_ZERO_BASED) {
1972 					err = EINVAL;
1973 					break;
1974 				}
1975 				info.op.bind_window.mw_stag =
1976 				    calc_type2_mw_stag(ib_wr->bind_mw.rkey, ib_wr->bind_mw.mw->rkey);
1977 				ib_wr->bind_mw.mw->rkey = info.op.bind_window.mw_stag;
1978 
1979 			}
1980 
1981 			if (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_ZERO_BASED) {
1982 				info.op.bind_window.addressing_type = IRDMA_ADDR_TYPE_ZERO_BASED;
1983 				info.op.bind_window.va = NULL;
1984 			} else {
1985 				info.op.bind_window.addressing_type = IRDMA_ADDR_TYPE_VA_BASED;
1986 				info.op.bind_window.va = (void *)(uintptr_t)ib_wr->bind_mw.bind_info.addr;
1987 			}
1988 			info.op.bind_window.bind_len = ib_wr->bind_mw.bind_info.length;
1989 			info.op.bind_window.ena_reads =
1990 			    (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_REMOTE_READ) ? 1 : 0;
1991 			info.op.bind_window.ena_writes =
1992 			    (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_REMOTE_WRITE) ? 1 : 0;
1993 
1994 			err = irdma_uk_mw_bind(&iwuqp->qp, &info, false);
1995 			break;
1996 		case IBV_WR_LOCAL_INV:
1997 			info.op_type = IRDMA_OP_TYPE_INV_STAG;
1998 			info.op.inv_local_stag.target_stag = ib_wr->imm_data;
1999 			err = irdma_uk_stag_local_invalidate(&iwuqp->qp, &info, true);
2000 			break;
2001 		default:
2002 			/* error */
2003 			err = EINVAL;
2004 			printf("%s: post work request failed, invalid opcode: 0x%x\n",
2005 			       __func__, ib_wr->opcode);
2006 			break;
2007 		}
2008 		if (err)
2009 			break;
2010 
2011 		ib_wr = ib_wr->next;
2012 	}
2013 
2014 	if (err)
2015 		*bad_wr = ib_wr;
2016 
2017 	irdma_uk_qp_post_wr(&iwuqp->qp);
2018 	if (reflush)
2019 		irdma_issue_flush(ib_qp, 1, 0);
2020 
2021 	pthread_spin_unlock(&iwuqp->lock);
2022 
2023 	return err;
2024 }
2025 
2026 /**
2027  * irdma_post_recv - post receive wr for user application
2028  * @ib_wr: work request for receive
2029  * @bad_wr: bad wr caused an error
2030  */
2031 int
2032 irdma_upost_recv(struct ibv_qp *ib_qp, struct ibv_recv_wr *ib_wr,
2033 		 struct ibv_recv_wr **bad_wr)
2034 {
2035 	struct irdma_post_rq_info post_recv = {};
2036 	struct irdma_sge *sg_list;
2037 	struct irdma_uqp *iwuqp;
2038 	bool reflush = false;
2039 	int err = 0;
2040 
2041 	iwuqp = container_of(ib_qp, struct irdma_uqp, ibv_qp);
2042 	sg_list = iwuqp->recv_sges;
2043 
2044 	err = pthread_spin_lock(&iwuqp->lock);
2045 	if (err)
2046 		return err;
2047 
2048 	if (!IRDMA_RING_MORE_WORK(iwuqp->qp.rq_ring) &&
2049 	    ib_qp->state == IBV_QPS_ERR)
2050 		reflush = true;
2051 
2052 	while (ib_wr) {
2053 		if (ib_wr->num_sge > iwuqp->qp.max_rq_frag_cnt) {
2054 			*bad_wr = ib_wr;
2055 			err = EINVAL;
2056 			goto error;
2057 		}
2058 		post_recv.num_sges = ib_wr->num_sge;
2059 		post_recv.wr_id = ib_wr->wr_id;
2060 		irdma_copy_sg_list(sg_list, ib_wr->sg_list, ib_wr->num_sge);
2061 		post_recv.sg_list = sg_list;
2062 		err = irdma_uk_post_receive(&iwuqp->qp, &post_recv);
2063 		if (err) {
2064 			*bad_wr = ib_wr;
2065 			goto error;
2066 		}
2067 
2068 		if (reflush)
2069 			irdma_issue_flush(ib_qp, 0, 1);
2070 
2071 		ib_wr = ib_wr->next;
2072 	}
2073 error:
2074 	pthread_spin_unlock(&iwuqp->lock);
2075 
2076 	return err;
2077 }
2078 
2079 /**
2080  * irdma_ucreate_ah - create address handle associated with a pd
2081  * @ibpd: pd for the address handle
2082  * @attr: attributes of address handle
2083  */
2084 struct ibv_ah *
2085 irdma_ucreate_ah(struct ibv_pd *ibpd, struct ibv_ah_attr *attr)
2086 {
2087 	struct irdma_uah *ah;
2088 	union ibv_gid sgid;
2089 	struct irdma_ucreate_ah_resp resp = {};
2090 	int err;
2091 
2092 	if (ibv_query_gid(ibpd->context, attr->port_num, attr->grh.sgid_index,
2093 			  &sgid)) {
2094 		fprintf(stderr, "irdma: Error from ibv_query_gid.\n");
2095 		errno = ENOENT;
2096 		return NULL;
2097 	}
2098 
2099 	ah = calloc(1, sizeof(*ah));
2100 	if (!ah)
2101 		return NULL;
2102 
2103 	err = ibv_cmd_create_ah(ibpd, &ah->ibv_ah, attr, &resp.ibv_resp,
2104 				sizeof(resp));
2105 	if (err) {
2106 		free(ah);
2107 		errno = err;
2108 		return NULL;
2109 	}
2110 
2111 	ah->ah_id = resp.ah_id;
2112 
2113 	return &ah->ibv_ah;
2114 }
2115 
2116 /**
2117  * irdma_udestroy_ah - destroy the address handle
2118  * @ibah: address handle
2119  */
2120 int
2121 irdma_udestroy_ah(struct ibv_ah *ibah)
2122 {
2123 	struct irdma_uah *ah;
2124 	int ret;
2125 
2126 	ah = container_of(ibah, struct irdma_uah, ibv_ah);
2127 
2128 	ret = ibv_cmd_destroy_ah(ibah);
2129 	if (ret)
2130 		return ret;
2131 
2132 	free(ah);
2133 
2134 	return 0;
2135 }
2136 
/**
 * irdma_uattach_mcast - Attach qp to multicast group implemented
 * @qp: The queue pair
 * @gid: The Global ID for multicast group
 * @lid: The Local ID
 *
 * Forwards the request to ibv_cmd_attach_mcast() and returns its result.
 */
int
irdma_uattach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
		    uint16_t lid)
{
	int ret;

	ret = ibv_cmd_attach_mcast(qp, gid, lid);

	return ret;
}
2149 
/**
 * irdma_udetach_mcast - detach a queue pair from a multicast group
 * @qp: queue pair to detach
 * @gid: global ID of the multicast group
 * @lid: local ID
 *
 * Forwards the request to ibv_cmd_detach_mcast() and returns its result.
 */
int
irdma_udetach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
		    uint16_t lid)
{
	int rc = ibv_cmd_detach_mcast(qp, gid, lid);

	return rc;
}
2162 
2163 /**
2164  * irdma_uresize_cq - resizes a cq
2165  * @cq: cq to resize
2166  * @cqe: the number of cqes of the new cq
2167  */
2168 int
2169 irdma_uresize_cq(struct ibv_cq *cq, int cqe)
2170 {
2171 	struct irdma_uvcontext *iwvctx;
2172 	struct irdma_uk_attrs *uk_attrs;
2173 	struct irdma_uresize_cq cmd = {};
2174 	struct ibv_resize_cq_resp resp = {};
2175 	struct irdma_ureg_mr reg_mr_cmd = {};
2176 	struct ibv_reg_mr_resp reg_mr_resp = {};
2177 	struct irdma_cq_buf *cq_buf = NULL;
2178 	struct irdma_cqe *cq_base = NULL;
2179 	struct verbs_mr new_mr = {};
2180 	struct irdma_ucq *iwucq;
2181 	size_t cq_size;
2182 	u32 cq_pages;
2183 	int cqe_needed;
2184 	int ret = 0;
2185 	bool cqe_64byte_ena;
2186 
2187 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
2188 	iwvctx = container_of(cq->context, struct irdma_uvcontext, ibv_ctx);
2189 	uk_attrs = &iwvctx->uk_attrs;
2190 
2191 	if (!(uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE))
2192 		return EOPNOTSUPP;
2193 
2194 	if (cqe < uk_attrs->min_hw_cq_size || cqe > uk_attrs->max_hw_cq_size - 1)
2195 		return EINVAL;
2196 
2197 	cqe_64byte_ena = uk_attrs->feature_flags & IRDMA_FEATURE_64_BYTE_CQE ? true : false;
2198 
2199 	cqe_needed = get_cq_size(cqe, uk_attrs->hw_rev, cqe_64byte_ena);
2200 
2201 	if (cqe_needed == iwucq->cq.cq_size)
2202 		return 0;
2203 
2204 	cq_size = get_cq_total_bytes(cqe_needed, cqe_64byte_ena);
2205 	cq_pages = cq_size >> IRDMA_HW_PAGE_SHIFT;
2206 	cq_base = irdma_alloc_hw_buf(cq_size);
2207 	if (!cq_base)
2208 		return ENOMEM;
2209 
2210 	memset(cq_base, 0, cq_size);
2211 
2212 	cq_buf = malloc(sizeof(*cq_buf));
2213 	if (!cq_buf) {
2214 		ret = ENOMEM;
2215 		goto err_buf;
2216 	}
2217 
2218 	new_mr.ibv_mr.pd = iwucq->vmr.ibv_mr.pd;
2219 	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
2220 	reg_mr_cmd.cq_pages = cq_pages;
2221 
2222 	ret = ibv_cmd_reg_mr(new_mr.ibv_mr.pd, cq_base, cq_size,
2223 			     (uintptr_t)cq_base, IBV_ACCESS_LOCAL_WRITE,
2224 			     &new_mr.ibv_mr, &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd),
2225 			     &reg_mr_resp, sizeof(reg_mr_resp));
2226 	if (ret)
2227 		goto err_dereg_mr;
2228 
2229 	ret = pthread_spin_lock(&iwucq->lock);
2230 	if (ret)
2231 		goto err_lock;
2232 
2233 	cmd.user_cq_buffer = (__u64) ((uintptr_t)cq_base);
2234 	ret = ibv_cmd_resize_cq(&iwucq->verbs_cq.cq, cqe_needed, &cmd.ibv_cmd,
2235 				sizeof(cmd), &resp, sizeof(resp));
2236 	if (ret)
2237 		goto err_resize;
2238 
2239 	memcpy(&cq_buf->cq, &iwucq->cq, sizeof(cq_buf->cq));
2240 	cq_buf->buf_size = cq_size;
2241 	cq_buf->vmr = iwucq->vmr;
2242 	iwucq->vmr = new_mr;
2243 	irdma_uk_cq_resize(&iwucq->cq, cq_base, cqe_needed);
2244 	iwucq->verbs_cq.cq.cqe = cqe;
2245 	LIST_INSERT_HEAD(&iwucq->resize_list, cq_buf, list);
2246 
2247 	pthread_spin_unlock(&iwucq->lock);
2248 
2249 	return ret;
2250 
2251 err_resize:
2252 	pthread_spin_unlock(&iwucq->lock);
2253 err_lock:
2254 	ibv_cmd_dereg_mr(&new_mr.ibv_mr);
2255 err_dereg_mr:
2256 	free(cq_buf);
2257 err_buf:
2258 	fprintf(stderr, "failed to resize CQ cq_id=%d ret=%d\n", iwucq->cq.cq_id, ret);
2259 	irdma_free_hw_buf(cq_base, cq_size);
2260 	return ret;
2261 }
2262