xref: /freebsd/contrib/ofed/libirdma/irdma_uverbs.c (revision 3dd5524264095ed8612c28908e13f80668eff2f9)
1 /*-
2  * SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
3  *
4  * Copyright (C) 2019 - 2022 Intel Corporation
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenFabrics.org BSD license below:
11  *
12  *   Redistribution and use in source and binary forms, with or
13  *   without modification, are permitted provided that the following
14  *   conditions are met:
15  *
16  *    - Redistributions of source code must retain the above
17  *	copyright notice, this list of conditions and the following
18  *	disclaimer.
19  *
20  *    - Redistributions in binary form must reproduce the above
21  *	copyright notice, this list of conditions and the following
22  *	disclaimer in the documentation and/or other materials
23  *	provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  */
34 /*$FreeBSD$*/
35 
36 #include <config.h>
37 #include <stdlib.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <signal.h>
42 #include <errno.h>
43 #include <sys/param.h>
44 #include <sys/mman.h>
45 #include <netinet/in.h>
46 #include <sys/stat.h>
47 #include <fcntl.h>
48 #include <stdbool.h>
49 #include <infiniband/opcode.h>
50 
51 #include "irdma_umain.h"
52 #include "abi.h"
53 
/*
 * print_fw_ver - render a packed firmware version as "major.minor"
 * @fw_ver: packed version; bits 32..47 hold the major number and
 *          bits 0..15 hold the minor number
 * @str: destination buffer
 * @len: size of @str in bytes (output is truncated by snprintf if needed)
 */
static inline void
print_fw_ver(uint64_t fw_ver, char *str, size_t len)
{
	const uint16_t fw_minor = (uint16_t)(fw_ver & 0xffff);
	const uint16_t fw_major = (uint16_t)((fw_ver >> 32) & 0xffff);

	snprintf(str, len, "%d.%d", fw_major, fw_minor);
}
64 
65 /**
66  * irdma_uquery_device_ex - query device attributes including extended properties
67  * @context: user context for the device
68  * @input: extensible input struct for ibv_query_device_ex verb
69  * @attr: extended device attribute struct
70  * @attr_size: size of extended device attribute struct
71  **/
72 int
73 irdma_uquery_device_ex(struct ibv_context *context,
74 		       const struct ibv_query_device_ex_input *input,
75 		       struct ibv_device_attr_ex *attr, size_t attr_size)
76 {
77 	struct irdma_query_device_ex cmd = {};
78 	struct irdma_query_device_ex_resp resp = {};
79 	uint64_t fw_ver;
80 	int ret;
81 
82 	ret = ibv_cmd_query_device_ex(context, input, attr, attr_size, &fw_ver,
83 				      &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd),
84 				      &resp.ibv_resp, sizeof(resp.ibv_resp), sizeof(resp));
85 	if (ret)
86 		return ret;
87 
88 	print_fw_ver(fw_ver, attr->orig_attr.fw_ver, sizeof(attr->orig_attr.fw_ver));
89 
90 	return 0;
91 }
92 
93 /**
94  * irdma_uquery_device - call driver to query device for max resources
95  * @context: user context for the device
96  * @attr: where to save all the mx resources from the driver
97  **/
98 int
99 irdma_uquery_device(struct ibv_context *context, struct ibv_device_attr *attr)
100 {
101 	struct ibv_query_device cmd;
102 	uint64_t fw_ver;
103 	int ret;
104 
105 	ret = ibv_cmd_query_device(context, attr, &fw_ver, &cmd, sizeof(cmd));
106 	if (ret)
107 		return ret;
108 
109 	print_fw_ver(fw_ver, attr->fw_ver, sizeof(attr->fw_ver));
110 
111 	return 0;
112 }
113 
114 /**
115  * irdma_uquery_port - get port attributes (msg size, lnk, mtu...)
116  * @context: user context of the device
117  * @port: port for the attributes
118  * @attr: to return port attributes
119  **/
120 int
121 irdma_uquery_port(struct ibv_context *context, uint8_t port,
122 		  struct ibv_port_attr *attr)
123 {
124 	struct ibv_query_port cmd;
125 
126 	return ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
127 }
128 
129 /**
130  * irdma_ualloc_pd - allocates protection domain and return pd ptr
131  * @context: user context of the device
132  **/
133 struct ibv_pd *
134 irdma_ualloc_pd(struct ibv_context *context)
135 {
136 	struct ibv_alloc_pd cmd;
137 	struct irdma_ualloc_pd_resp resp = {};
138 	struct irdma_upd *iwupd;
139 	int err;
140 
141 	iwupd = calloc(1, sizeof(*iwupd));
142 	if (!iwupd)
143 		return NULL;
144 
145 	err = ibv_cmd_alloc_pd(context, &iwupd->ibv_pd, &cmd, sizeof(cmd),
146 			       &resp.ibv_resp, sizeof(resp));
147 	if (err)
148 		goto err_free;
149 
150 	iwupd->pd_id = resp.pd_id;
151 
152 	return &iwupd->ibv_pd;
153 
154 err_free:
155 	free(iwupd);
156 	errno = err;
157 	return NULL;
158 }
159 
160 /**
161  * irdma_ufree_pd - free pd resources
162  * @pd: pd to free resources
163  */
164 int
165 irdma_ufree_pd(struct ibv_pd *pd)
166 {
167 	struct irdma_uvcontext *iwvctx = container_of(pd->context, struct irdma_uvcontext, ibv_ctx);
168 	struct irdma_upd *iwupd;
169 	int ret;
170 
171 	iwupd = container_of(pd, struct irdma_upd, ibv_pd);
172 	ret = ibv_cmd_dealloc_pd(pd);
173 	if (ret)
174 		return ret;
175 
176 	free(iwupd);
177 
178 	return 0;
179 }
180 
181 /**
182  * irdma_ureg_mr - register user memory region
183  * @pd: pd for the mr
184  * @addr: user address of the memory region
185  * @length: length of the memory
186  * @hca_va: hca_va
187  * @access: access allowed on this mr
188  */
189 struct ibv_mr *
190 irdma_ureg_mr(struct ibv_pd *pd, void *addr, size_t length,
191 	      int access)
192 {
193 	struct verbs_mr *vmr;
194 	struct irdma_ureg_mr cmd = {};
195 	struct ibv_reg_mr_resp resp;
196 	int err;
197 
198 	vmr = malloc(sizeof(*vmr));
199 	if (!vmr)
200 		return NULL;
201 
202 	cmd.reg_type = IRDMA_MEMREG_TYPE_MEM;
203 	err = ibv_cmd_reg_mr(pd, addr, length,
204 			     (uintptr_t)addr, access, &vmr->ibv_mr, &cmd.ibv_cmd,
205 			     sizeof(cmd), &resp, sizeof(resp));
206 	if (err) {
207 		free(vmr);
208 		errno = err;
209 		return NULL;
210 	}
211 
212 	return &vmr->ibv_mr;
213 }
214 
215 /*
216  * irdma_urereg_mr - re-register memory region @vmr: mr that was allocated @flags: bit mask to indicate which of the
217  * attr's of MR modified @pd: pd of the mr @addr: user address of the memory region @length: length of the memory
218  * @access: access allowed on this mr
219  */
220 int
221 irdma_urereg_mr(struct verbs_mr *vmr, int flags, struct ibv_pd *pd,
222 		void *addr, size_t length, int access)
223 {
224 	struct irdma_urereg_mr cmd = {};
225 	struct ibv_rereg_mr_resp resp;
226 
227 	cmd.reg_type = IRDMA_MEMREG_TYPE_MEM;
228 	return ibv_cmd_rereg_mr(&vmr->ibv_mr, flags, addr, length, (uintptr_t)addr,
229 				access, pd, &cmd.ibv_cmd, sizeof(cmd), &resp,
230 				sizeof(resp));
231 }
232 
233 /**
234  * irdma_udereg_mr - re-register memory region
235  * @mr: mr that was allocated
236  */
237 int
238 irdma_udereg_mr(struct ibv_mr *mr)
239 {
240 	struct verbs_mr *vmr;
241 	int ret;
242 
243 	vmr = container_of(mr, struct verbs_mr, ibv_mr);
244 
245 	ret = ibv_cmd_dereg_mr(mr);
246 	if (ret)
247 		return ret;
248 
249 	return 0;
250 }
251 
252 /**
253  * irdma_ualloc_mw - allocate memory window
254  * @pd: protection domain
255  * @type: memory window type
256  */
257 struct ibv_mw *
258 irdma_ualloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
259 {
260 	struct ibv_mw *mw;
261 	struct ibv_alloc_mw cmd;
262 	struct ibv_alloc_mw_resp resp;
263 	int err;
264 
265 	mw = calloc(1, sizeof(*mw));
266 	if (!mw)
267 		return NULL;
268 
269 	if (ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), &resp,
270 			     sizeof(resp))) {
271 		printf("%s: Failed to alloc memory window\n",
272 		       __func__);
273 		free(mw);
274 		return NULL;
275 	}
276 
277 	return mw;
278 }
279 
280 /**
281  * irdma_ubind_mw - bind a memory window
282  * @qp: qp to post WR
283  * @mw: memory window to bind
284  * @mw_bind: bind info
285  */
286 int
287 irdma_ubind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
288 	       struct ibv_mw_bind *mw_bind)
289 {
290 	struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info;
291 	struct verbs_mr *vmr;
292 
293 	struct ibv_send_wr wr = {};
294 	struct ibv_send_wr *bad_wr;
295 	int err;
296 
297 	if (!bind_info->mr && (bind_info->addr || bind_info->length))
298 		return EINVAL;
299 
300 	if (bind_info->mr) {
301 		vmr = verbs_get_mr(bind_info->mr);
302 		if (vmr->mr_type != IBV_MR_TYPE_MR)
303 			return ENOTSUP;
304 
305 		if (vmr->access & IBV_ACCESS_ZERO_BASED)
306 			return EINVAL;
307 
308 		if (mw->pd != bind_info->mr->pd)
309 			return EPERM;
310 	}
311 
312 	wr.opcode = IBV_WR_BIND_MW;
313 	wr.bind_mw.bind_info = mw_bind->bind_info;
314 	wr.bind_mw.mw = mw;
315 	wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey);
316 
317 	wr.wr_id = mw_bind->wr_id;
318 	wr.send_flags = mw_bind->send_flags;
319 
320 	err = irdma_upost_send(qp, &wr, &bad_wr);
321 	if (!err)
322 		mw->rkey = wr.bind_mw.rkey;
323 
324 	return err;
325 }
326 
327 /**
328  * irdma_udealloc_mw - deallocate memory window
329  * @mw: memory window to dealloc
330  */
331 int
332 irdma_udealloc_mw(struct ibv_mw *mw)
333 {
334 	int ret;
335 	struct ibv_dealloc_mw cmd;
336 
337 	ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd));
338 	if (ret)
339 		return ret;
340 	free(mw);
341 
342 	return 0;
343 }
344 
345 static void *
346 irdma_alloc_hw_buf(size_t size)
347 {
348 	void *buf;
349 
350 	buf = memalign(IRDMA_HW_PAGE_SIZE, size);
351 
352 	if (!buf)
353 		return NULL;
354 	if (ibv_dontfork_range(buf, size)) {
355 		free(buf);
356 		return NULL;
357 	}
358 
359 	return buf;
360 }
361 
/*
 * irdma_free_hw_buf - release a buffer obtained from irdma_alloc_hw_buf()
 * @buf: buffer to release
 * @size: size that was used when the buffer was allocated
 *
 * Undoes the ibv_dontfork_range() marking before the memory is freed.
 */
static void
irdma_free_hw_buf(void *buf, size_t size)
{
	/* the range call needs the mapping, so it has to come first */
	ibv_dofork_range(buf, size);

	free(buf);
}
368 
369 /**
370  * get_cq_size - returns actual cqe needed by HW
371  * @ncqe: minimum cqes requested by application
372  * @hw_rev: HW generation
373  * @cqe_64byte_ena: enable 64byte cqe
374  */
375 static inline int
376 get_cq_size(int ncqe, u8 hw_rev, bool cqe_64byte_ena)
377 {
378 	ncqe++;
379 
380 	/* Completions with immediate require 1 extra entry */
381 	if (!cqe_64byte_ena && hw_rev > IRDMA_GEN_1)
382 		ncqe *= 2;
383 
384 	if (ncqe < IRDMA_U_MINCQ_SIZE)
385 		ncqe = IRDMA_U_MINCQ_SIZE;
386 
387 	return ncqe;
388 }
389 
390 static inline size_t get_cq_total_bytes(u32 cq_size, bool cqe_64byte_ena){
391 	if (cqe_64byte_ena)
392 		return roundup(cq_size * sizeof(struct irdma_extended_cqe), IRDMA_HW_PAGE_SIZE);
393 	else
394 		return roundup(cq_size * sizeof(struct irdma_cqe), IRDMA_HW_PAGE_SIZE);
395 }
396 
397 /**
398  * ucreate_cq - irdma util function to create a CQ
399  * @context: ibv context
400  * @attr_ex: CQ init attributes
401  * @ext_cq: flag to create an extendable or normal CQ
402  */
403 static struct ibv_cq_ex *
404 ucreate_cq(struct ibv_context *context,
405 	   struct ibv_cq_init_attr_ex *attr_ex,
406 	   bool ext_cq)
407 {
408 	struct irdma_cq_uk_init_info info = {};
409 	struct irdma_ureg_mr reg_mr_cmd = {};
410 	struct irdma_ucreate_cq_ex cmd = {};
411 	struct irdma_ucreate_cq_ex_resp resp = {};
412 	struct ibv_reg_mr_resp reg_mr_resp = {};
413 	struct irdma_ureg_mr reg_mr_shadow_cmd = {};
414 	struct ibv_reg_mr_resp reg_mr_shadow_resp = {};
415 	struct irdma_uk_attrs *uk_attrs;
416 	struct irdma_uvcontext *iwvctx;
417 	struct irdma_ucq *iwucq;
418 	size_t total_size;
419 	u32 cq_pages;
420 	int ret, ncqe;
421 	u8 hw_rev;
422 	bool cqe_64byte_ena;
423 
424 	iwvctx = container_of(context, struct irdma_uvcontext, ibv_ctx);
425 	uk_attrs = &iwvctx->uk_attrs;
426 	hw_rev = uk_attrs->hw_rev;
427 
428 	if (ext_cq) {
429 		u32 supported_flags = IRDMA_STANDARD_WC_FLAGS_EX;
430 
431 		if (hw_rev == IRDMA_GEN_1 || attr_ex->wc_flags & ~supported_flags) {
432 			errno = EOPNOTSUPP;
433 			return NULL;
434 		}
435 	}
436 
437 	if (attr_ex->cqe < uk_attrs->min_hw_cq_size || attr_ex->cqe > uk_attrs->max_hw_cq_size - 1) {
438 		errno = EINVAL;
439 		return NULL;
440 	}
441 
442 	/* save the cqe requested by application */
443 	ncqe = attr_ex->cqe;
444 
445 	iwucq = calloc(1, sizeof(*iwucq));
446 	if (!iwucq)
447 		return NULL;
448 
449 	if (pthread_spin_init(&iwucq->lock, PTHREAD_PROCESS_PRIVATE)) {
450 		free(iwucq);
451 		return NULL;
452 	}
453 
454 	cqe_64byte_ena = uk_attrs->feature_flags & IRDMA_FEATURE_64_BYTE_CQE ? true : false;
455 	info.cq_size = get_cq_size(attr_ex->cqe, hw_rev, cqe_64byte_ena);
456 	iwucq->comp_vector = attr_ex->comp_vector;
457 	LIST_INIT(&iwucq->resize_list);
458 	LIST_INIT(&iwucq->cmpl_generated);
459 	total_size = get_cq_total_bytes(info.cq_size, cqe_64byte_ena);
460 	cq_pages = total_size >> IRDMA_HW_PAGE_SHIFT;
461 
462 	if (!(uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE))
463 		total_size = (cq_pages << IRDMA_HW_PAGE_SHIFT) + IRDMA_DB_SHADOW_AREA_SIZE;
464 
465 	iwucq->buf_size = total_size;
466 	info.cq_base = irdma_alloc_hw_buf(total_size);
467 	if (!info.cq_base)
468 		goto err_cq_base;
469 
470 	memset(info.cq_base, 0, total_size);
471 	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
472 	reg_mr_cmd.cq_pages = cq_pages;
473 
474 	ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.cq_base,
475 			     total_size, (uintptr_t)info.cq_base,
476 			     IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr.ibv_mr,
477 			     &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd),
478 			     &reg_mr_resp, sizeof(reg_mr_resp));
479 	if (ret) {
480 		errno = ret;
481 		goto err_dereg_mr;
482 	}
483 
484 	iwucq->vmr.ibv_mr.pd = &iwvctx->iwupd->ibv_pd;
485 
486 	if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) {
487 		info.shadow_area = irdma_alloc_hw_buf(IRDMA_DB_SHADOW_AREA_SIZE);
488 		if (!info.shadow_area)
489 			goto err_alloc_shadow;
490 
491 		memset(info.shadow_area, 0, IRDMA_DB_SHADOW_AREA_SIZE);
492 		reg_mr_shadow_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
493 		reg_mr_shadow_cmd.cq_pages = 1;
494 
495 		ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.shadow_area,
496 				     IRDMA_DB_SHADOW_AREA_SIZE, (uintptr_t)info.shadow_area,
497 				     IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr_shadow_area.ibv_mr,
498 				     &reg_mr_shadow_cmd.ibv_cmd, sizeof(reg_mr_shadow_cmd),
499 				     &reg_mr_shadow_resp, sizeof(reg_mr_shadow_resp));
500 		if (ret) {
501 			irdma_free_hw_buf(info.shadow_area, IRDMA_DB_SHADOW_AREA_SIZE);
502 			errno = ret;
503 			goto err_alloc_shadow;
504 		}
505 
506 		iwucq->vmr_shadow_area.ibv_mr.pd = &iwvctx->iwupd->ibv_pd;
507 
508 	} else {
509 		info.shadow_area = (__le64 *) ((u8 *)info.cq_base + (cq_pages << IRDMA_HW_PAGE_SHIFT));
510 	}
511 
512 	attr_ex->cqe = info.cq_size;
513 	cmd.user_cq_buf = (__u64) ((uintptr_t)info.cq_base);
514 	cmd.user_shadow_area = (__u64) ((uintptr_t)info.shadow_area);
515 
516 	ret = ibv_cmd_create_cq_ex(context, attr_ex, &iwucq->verbs_cq.cq_ex,
517 				   &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd), &resp.ibv_resp,
518 				   sizeof(resp.ibv_resp), sizeof(resp));
519 	attr_ex->cqe = ncqe;
520 	if (ret) {
521 		errno = ret;
522 		goto err_create_cq;
523 	}
524 
525 	if (ext_cq)
526 		irdma_ibvcq_ex_fill_priv_funcs(iwucq, attr_ex);
527 	info.cq_id = resp.cq_id;
528 	/* Do not report the CQE's reserved for immediate and burned by HW */
529 	iwucq->verbs_cq.cq.cqe = ncqe;
530 	if (cqe_64byte_ena)
531 		info.avoid_mem_cflct = true;
532 	info.cqe_alloc_db = (u32 *)((u8 *)iwvctx->db + IRDMA_DB_CQ_OFFSET);
533 	irdma_uk_cq_init(&iwucq->cq, &info);
534 	return &iwucq->verbs_cq.cq_ex;
535 
536 err_create_cq:
537 	if (iwucq->vmr_shadow_area.ibv_mr.handle) {
538 		ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area.ibv_mr);
539 		irdma_free_hw_buf(info.shadow_area, IRDMA_DB_SHADOW_AREA_SIZE);
540 	}
541 err_alloc_shadow:
542 	ibv_cmd_dereg_mr(&iwucq->vmr.ibv_mr);
543 err_dereg_mr:
544 	irdma_free_hw_buf(info.cq_base, total_size);
545 err_cq_base:
546 	printf("%s: failed to initialize CQ\n", __func__);
547 	pthread_spin_destroy(&iwucq->lock);
548 
549 	free(iwucq);
550 
551 	return NULL;
552 }
553 
554 struct ibv_cq *
555 irdma_ucreate_cq(struct ibv_context *context, int cqe,
556 		 struct ibv_comp_channel *channel,
557 		 int comp_vector)
558 {
559 	struct ibv_cq_init_attr_ex attr_ex = {
560 		.cqe = cqe,
561 		.channel = channel,
562 		.comp_vector = comp_vector,
563 	};
564 	struct ibv_cq_ex *ibvcq_ex;
565 
566 	ibvcq_ex = ucreate_cq(context, &attr_ex, false);
567 
568 	return ibvcq_ex ? ibv_cq_ex_to_cq(ibvcq_ex) : NULL;
569 }
570 
/**
 * irdma_ucreate_cq_ex - verb API callback to create an extended CQ
 * @context: ibv context
 * @attr_ex: CQ init attributes
 *
 * Calls ucreate_cq() with the extended flag set and returns its result.
 */
struct ibv_cq_ex *
irdma_ucreate_cq_ex(struct ibv_context *context,
		    struct ibv_cq_init_attr_ex *attr_ex)
{
	struct ibv_cq_ex *cq_ex = ucreate_cq(context, attr_ex, true);

	return cq_ex;
}
577 
578 /**
579  * irdma_free_cq_buf - free memory for cq buffer
580  * @cq_buf: cq buf to free
581  */
582 static void
583 irdma_free_cq_buf(struct irdma_cq_buf *cq_buf)
584 {
585 	ibv_cmd_dereg_mr(&cq_buf->vmr.ibv_mr);
586 	irdma_free_hw_buf(cq_buf->cq.cq_base, cq_buf->buf_size);
587 	free(cq_buf);
588 }
589 
590 /**
591  * irdma_process_resize_list - process the cq list to remove buffers
592  * @iwucq: cq which owns the list
593  * @lcqe_buf: cq buf where the last cqe is found
594  */
595 static int
596 irdma_process_resize_list(struct irdma_ucq *iwucq,
597 			  struct irdma_cq_buf *lcqe_buf)
598 {
599 	struct irdma_cq_buf *cq_buf, *next;
600 	int cq_cnt = 0;
601 
602 	LIST_FOREACH_SAFE(cq_buf, &iwucq->resize_list, list, next) {
603 		if (cq_buf == lcqe_buf)
604 			return cq_cnt;
605 
606 		LIST_REMOVE(cq_buf, list);
607 		irdma_free_cq_buf(cq_buf);
608 		cq_cnt++;
609 	}
610 
611 	return cq_cnt;
612 }
613 
614 static void
615 irdma_remove_cmpls_list(struct irdma_ucq *iwucq)
616 {
617 	struct irdma_cmpl_gen *cmpl_node, *next;
618 
619 	LIST_FOREACH_SAFE(cmpl_node, &iwucq->cmpl_generated, list, next) {
620 		LIST_REMOVE(cmpl_node, list);
621 		free(cmpl_node);
622 	}
623 }
624 
625 static int
626 irdma_generated_cmpls(struct irdma_ucq *iwucq, struct irdma_cq_poll_info *cq_poll_info)
627 {
628 	struct irdma_cmpl_gen *cmpl;
629 
630 	if (!iwucq || LIST_EMPTY(&iwucq->cmpl_generated))
631 		return ENOENT;
632 	cmpl = LIST_FIRST(&iwucq->cmpl_generated);
633 	LIST_REMOVE(cmpl, list);
634 	memcpy(cq_poll_info, &cmpl->cpi, sizeof(*cq_poll_info));
635 
636 	free(cmpl);
637 
638 	return 0;
639 }
640 
641 /**
642  * irdma_set_cpi_common_values - fill in values for polling info struct
643  * @cpi: resulting structure of cq_poll_info type
644  * @qp: QPair
645  * @qp_num: id of the QP
646  */
647 static void
648 irdma_set_cpi_common_values(struct irdma_cq_poll_info *cpi,
649 			    struct irdma_qp_uk *qp, __u32 qp_num)
650 {
651 	cpi->comp_status = IRDMA_COMPL_STATUS_FLUSHED;
652 	cpi->error = 1;
653 	cpi->major_err = IRDMA_FLUSH_MAJOR_ERR;
654 	cpi->minor_err = FLUSH_GENERAL_ERR;
655 	cpi->qp_handle = (irdma_qp_handle) (uintptr_t)qp;
656 	cpi->qp_id = qp_num;
657 }
658 
659 static bool
660 irdma_cq_empty(struct irdma_ucq *iwucq)
661 {
662 	struct irdma_cq_uk *ukcq;
663 	__u64 qword3;
664 	__le64 *cqe;
665 	__u8 polarity;
666 
667 	ukcq = &iwucq->cq;
668 	cqe = IRDMA_GET_CURRENT_CQ_ELEM(ukcq);
669 	get_64bit_val(cqe, 24, &qword3);
670 	polarity = (__u8) FIELD_GET(IRDMA_CQ_VALID, qword3);
671 
672 	return polarity != ukcq->polarity;
673 }
674 
675 /**
676  * irdma_generate_flush_completions - generate completion from WRs
677  * @iwuqp: pointer to QP
678  */
679 static void
680 irdma_generate_flush_completions(struct irdma_uqp *iwuqp)
681 {
682 	struct irdma_qp_uk *qp = &iwuqp->qp;
683 	struct irdma_ring *sq_ring = &qp->sq_ring;
684 	struct irdma_ring *rq_ring = &qp->rq_ring;
685 	struct irdma_cmpl_gen *cmpl;
686 	__le64 *sw_wqe;
687 	__u64 wqe_qword;
688 	__u32 wqe_idx;
689 
690 	if (pthread_spin_lock(&iwuqp->send_cq->lock))
691 		return;
692 	if (irdma_cq_empty(iwuqp->send_cq)) {
693 		while (IRDMA_RING_MORE_WORK(*sq_ring)) {
694 			cmpl = malloc(sizeof(*cmpl));
695 			if (!cmpl) {
696 				pthread_spin_unlock(&iwuqp->send_cq->lock);
697 				return;
698 			}
699 
700 			wqe_idx = sq_ring->tail;
701 			irdma_set_cpi_common_values(&cmpl->cpi, qp, qp->qp_id);
702 			cmpl->cpi.wr_id = qp->sq_wrtrk_array[wqe_idx].wrid;
703 			sw_wqe = qp->sq_base[wqe_idx].elem;
704 			get_64bit_val(sw_wqe, 24, &wqe_qword);
705 			cmpl->cpi.op_type = (__u8) FIELD_GET(IRDMAQPSQ_OPCODE, wqe_qword);
706 			/* remove the SQ WR by moving SQ tail */
707 			IRDMA_RING_SET_TAIL(*sq_ring, sq_ring->tail + qp->sq_wrtrk_array[sq_ring->tail].quanta);
708 			LIST_INSERT_HEAD(&iwuqp->send_cq->cmpl_generated, cmpl, list);
709 		}
710 	}
711 	pthread_spin_unlock(&iwuqp->send_cq->lock);
712 	if (pthread_spin_lock(&iwuqp->recv_cq->lock))
713 		return;
714 	if (irdma_cq_empty(iwuqp->recv_cq)) {
715 		while (IRDMA_RING_MORE_WORK(*rq_ring)) {
716 			cmpl = malloc(sizeof(*cmpl));
717 			if (!cmpl) {
718 				pthread_spin_unlock(&iwuqp->recv_cq->lock);
719 				return;
720 			}
721 
722 			wqe_idx = rq_ring->tail;
723 			irdma_set_cpi_common_values(&cmpl->cpi, qp, qp->qp_id);
724 			cmpl->cpi.wr_id = qp->rq_wrid_array[wqe_idx];
725 			cmpl->cpi.op_type = IRDMA_OP_TYPE_REC;
726 			/* remove the RQ WR by moving RQ tail */
727 			IRDMA_RING_SET_TAIL(*rq_ring, rq_ring->tail + 1);
728 			LIST_INSERT_HEAD(&iwuqp->recv_cq->cmpl_generated, cmpl, list);
729 		}
730 	}
731 	pthread_spin_unlock(&iwuqp->recv_cq->lock);
732 }
733 
734 void *
735 irdma_flush_thread(void *arg)
736 {
737 	__u8 i = 5;
738 	struct irdma_uqp *iwuqp = arg;
739 
740 	while (--i) {
741 		if (pthread_spin_lock(&iwuqp->lock))
742 			break;
743 		irdma_generate_flush_completions(arg);
744 		pthread_spin_unlock(&iwuqp->lock);
745 		sleep(1);
746 	}
747 	pthread_exit(NULL);
748 }
749 
750 /**
751  * irdma_udestroy_cq - destroys cq
752  * @cq: ptr to cq to be destroyed
753  */
754 int
755 irdma_udestroy_cq(struct ibv_cq *cq)
756 {
757 	struct irdma_uk_attrs *uk_attrs;
758 	struct irdma_uvcontext *iwvctx;
759 	struct irdma_ucq *iwucq;
760 	int ret;
761 
762 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
763 	iwvctx = container_of(cq->context, struct irdma_uvcontext, ibv_ctx);
764 	uk_attrs = &iwvctx->uk_attrs;
765 
766 	ret = pthread_spin_destroy(&iwucq->lock);
767 	if (ret)
768 		goto err;
769 
770 	if (!LIST_EMPTY(&iwucq->cmpl_generated))
771 		irdma_remove_cmpls_list(iwucq);
772 	irdma_process_resize_list(iwucq, NULL);
773 	ret = ibv_cmd_destroy_cq(cq);
774 	if (ret)
775 		goto err;
776 
777 	ibv_cmd_dereg_mr(&iwucq->vmr.ibv_mr);
778 	irdma_free_hw_buf(iwucq->cq.cq_base, iwucq->buf_size);
779 
780 	if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) {
781 		ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area.ibv_mr);
782 		irdma_free_hw_buf(iwucq->cq.shadow_area, IRDMA_DB_SHADOW_AREA_SIZE);
783 	}
784 	free(iwucq);
785 	return 0;
786 
787 err:
788 	return ret;
789 }
790 
791 static enum ibv_wc_status
792 irdma_flush_err_to_ib_wc_status(enum irdma_flush_opcode opcode)
793 {
794 	switch (opcode) {
795 	case FLUSH_PROT_ERR:
796 		return IBV_WC_LOC_PROT_ERR;
797 	case FLUSH_REM_ACCESS_ERR:
798 		return IBV_WC_REM_ACCESS_ERR;
799 	case FLUSH_LOC_QP_OP_ERR:
800 		return IBV_WC_LOC_QP_OP_ERR;
801 	case FLUSH_REM_OP_ERR:
802 		return IBV_WC_REM_OP_ERR;
803 	case FLUSH_LOC_LEN_ERR:
804 		return IBV_WC_LOC_LEN_ERR;
805 	case FLUSH_GENERAL_ERR:
806 		return IBV_WC_WR_FLUSH_ERR;
807 	case FLUSH_MW_BIND_ERR:
808 		return IBV_WC_MW_BIND_ERR;
809 	case FLUSH_REM_INV_REQ_ERR:
810 		return IBV_WC_REM_INV_REQ_ERR;
811 	case FLUSH_RETRY_EXC_ERR:
812 		return IBV_WC_RETRY_EXC_ERR;
813 	case FLUSH_FATAL_ERR:
814 	default:
815 		return IBV_WC_FATAL_ERR;
816 	}
817 }
818 
819 static inline void
820 set_ib_wc_op_sq(struct irdma_cq_poll_info *cur_cqe, struct ibv_wc *entry)
821 {
822 	switch (cur_cqe->op_type) {
823 	case IRDMA_OP_TYPE_RDMA_WRITE:
824 	case IRDMA_OP_TYPE_RDMA_WRITE_SOL:
825 		entry->opcode = IBV_WC_RDMA_WRITE;
826 		break;
827 	case IRDMA_OP_TYPE_RDMA_READ:
828 		entry->opcode = IBV_WC_RDMA_READ;
829 		break;
830 	case IRDMA_OP_TYPE_SEND_SOL:
831 	case IRDMA_OP_TYPE_SEND_SOL_INV:
832 	case IRDMA_OP_TYPE_SEND_INV:
833 	case IRDMA_OP_TYPE_SEND:
834 		entry->opcode = IBV_WC_SEND;
835 		break;
836 	case IRDMA_OP_TYPE_BIND_MW:
837 		entry->opcode = IBV_WC_BIND_MW;
838 		break;
839 	case IRDMA_OP_TYPE_INV_STAG:
840 		entry->opcode = IBV_WC_LOCAL_INV;
841 		break;
842 	default:
843 		entry->status = IBV_WC_GENERAL_ERR;
844 		printf("%s: Invalid opcode = %d in CQE\n",
845 		       __func__, cur_cqe->op_type);
846 	}
847 }
848 
849 static inline void
850 set_ib_wc_op_rq(struct irdma_cq_poll_info *cur_cqe,
851 		struct ibv_wc *entry, bool send_imm_support)
852 {
853 	if (!send_imm_support) {
854 		entry->opcode = cur_cqe->imm_valid ? IBV_WC_RECV_RDMA_WITH_IMM :
855 		    IBV_WC_RECV;
856 		return;
857 	}
858 	switch (cur_cqe->op_type) {
859 	case IBV_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
860 	case IBV_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
861 		entry->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
862 		break;
863 	default:
864 		entry->opcode = IBV_WC_RECV;
865 	}
866 }
867 
868 /**
869  * irdma_process_cqe_ext - process current cqe for extended CQ
870  * @cur_cqe - current cqe info
871  */
872 static void
873 irdma_process_cqe_ext(struct irdma_cq_poll_info *cur_cqe)
874 {
875 	struct irdma_ucq *iwucq = container_of(cur_cqe, struct irdma_ucq, cur_cqe);
876 	struct ibv_cq_ex *ibvcq_ex = &iwucq->verbs_cq.cq_ex;
877 
878 	ibvcq_ex->wr_id = cur_cqe->wr_id;
879 	if (cur_cqe->error)
880 		ibvcq_ex->status = (cur_cqe->comp_status == IRDMA_COMPL_STATUS_FLUSHED) ?
881 		    irdma_flush_err_to_ib_wc_status(cur_cqe->minor_err) : IBV_WC_GENERAL_ERR;
882 	else
883 		ibvcq_ex->status = IBV_WC_SUCCESS;
884 }
885 
886 /**
887  * irdma_process_cqe - process current cqe info
888  * @entry - ibv_wc object to fill in for non-extended CQ
889  * @cur_cqe - current cqe info
890  */
891 static void
892 irdma_process_cqe(struct ibv_wc *entry, struct irdma_cq_poll_info *cur_cqe)
893 {
894 	struct irdma_qp_uk *qp;
895 	struct ibv_qp *ib_qp;
896 
897 	entry->wc_flags = 0;
898 	entry->wr_id = cur_cqe->wr_id;
899 	entry->qp_num = cur_cqe->qp_id;
900 	qp = cur_cqe->qp_handle;
901 	ib_qp = qp->back_qp;
902 
903 	if (cur_cqe->error) {
904 		entry->status = (cur_cqe->comp_status == IRDMA_COMPL_STATUS_FLUSHED) ?
905 		    irdma_flush_err_to_ib_wc_status(cur_cqe->minor_err) : IBV_WC_GENERAL_ERR;
906 		entry->vendor_err = cur_cqe->major_err << 16 |
907 		    cur_cqe->minor_err;
908 	} else {
909 		entry->status = IBV_WC_SUCCESS;
910 	}
911 
912 	if (cur_cqe->imm_valid) {
913 		entry->imm_data = htonl(cur_cqe->imm_data);
914 		entry->wc_flags |= IBV_WC_WITH_IMM;
915 	}
916 
917 	if (cur_cqe->q_type == IRDMA_CQE_QTYPE_SQ) {
918 		set_ib_wc_op_sq(cur_cqe, entry);
919 	} else {
920 		set_ib_wc_op_rq(cur_cqe, entry,
921 				qp->qp_caps & IRDMA_SEND_WITH_IMM ?
922 				true : false);
923 		if (ib_qp->qp_type != IBV_QPT_UD &&
924 		    cur_cqe->stag_invalid_set) {
925 			entry->invalidated_rkey = cur_cqe->inv_stag;
926 			entry->wc_flags |= IBV_WC_WITH_INV;
927 		}
928 	}
929 
930 	if (ib_qp->qp_type == IBV_QPT_UD) {
931 		entry->src_qp = cur_cqe->ud_src_qpn;
932 		entry->wc_flags |= IBV_WC_GRH;
933 	} else {
934 		entry->src_qp = cur_cqe->qp_id;
935 	}
936 	entry->byte_len = cur_cqe->bytes_xfered;
937 }
938 
/**
 * irdma_poll_one - poll one entry of the CQ
 * @ukcq: ukcq to poll
 * @cur_cqe: current CQE info to be filled in
 * @entry: ibv_wc object to be filled for non-extended CQ or NULL for extended CQ
 *
 * Returns the internal irdma device error code or 0 on success. Nothing is
 * reported through @entry or the extended CQ when polling fails.
 */
static int
irdma_poll_one(struct irdma_cq_uk *ukcq, struct irdma_cq_poll_info *cur_cqe,
	       struct ibv_wc *entry)
{
	int poll_err;

	poll_err = irdma_uk_cq_poll_cmpl(ukcq, cur_cqe);
	if (poll_err)
		return poll_err;

	if (entry)
		irdma_process_cqe(entry, cur_cqe);
	else
		irdma_process_cqe_ext(cur_cqe);

	return 0;
}
963 
964 /**
965  * __irdma_upoll_cq - irdma util function to poll device CQ
966  * @iwucq: irdma cq to poll
967  * @num_entries: max cq entries to poll
968  * @entry: pointer to array of ibv_wc objects to be filled in for each completion or NULL if ext CQ
969  *
970  * Returns non-negative value equal to the number of completions
971  * found. On failure, EINVAL
972  */
973 static int
974 __irdma_upoll_cq(struct irdma_ucq *iwucq, int num_entries,
975 		 struct ibv_wc *entry)
976 {
977 	struct irdma_cq_buf *cq_buf, *next;
978 	struct irdma_cq_buf *last_buf = NULL;
979 	struct irdma_cq_poll_info *cur_cqe = &iwucq->cur_cqe;
980 	bool cq_new_cqe = false;
981 	int resized_bufs = 0;
982 	int npolled = 0;
983 	int ret;
984 
985 	/* go through the list of previously resized CQ buffers */
986 	LIST_FOREACH_SAFE(cq_buf, &iwucq->resize_list, list, next) {
987 		while (npolled < num_entries) {
988 			ret = irdma_poll_one(&cq_buf->cq, cur_cqe,
989 					     entry ? entry + npolled : NULL);
990 			if (!ret) {
991 				++npolled;
992 				cq_new_cqe = true;
993 				continue;
994 			}
995 			if (ret == ENOENT)
996 				break;
997 			/* QP using the CQ is destroyed. Skip reporting this CQE */
998 			if (ret == EFAULT) {
999 				cq_new_cqe = true;
1000 				continue;
1001 			}
1002 			goto error;
1003 		}
1004 
1005 		/* save the resized CQ buffer which received the last cqe */
1006 		if (cq_new_cqe)
1007 			last_buf = cq_buf;
1008 		cq_new_cqe = false;
1009 	}
1010 
1011 	/* check the current CQ for new cqes */
1012 	while (npolled < num_entries) {
1013 		ret = irdma_poll_one(&iwucq->cq, cur_cqe,
1014 				     entry ? entry + npolled : NULL);
1015 		if (ret == ENOENT) {
1016 			ret = irdma_generated_cmpls(iwucq, cur_cqe);
1017 			if (!ret) {
1018 				if (entry)
1019 					irdma_process_cqe(entry + npolled, cur_cqe);
1020 				else
1021 					irdma_process_cqe_ext(cur_cqe);
1022 			}
1023 		}
1024 		if (!ret) {
1025 			++npolled;
1026 			cq_new_cqe = true;
1027 			continue;
1028 		}
1029 		if (ret == ENOENT)
1030 			break;
1031 		/* QP using the CQ is destroyed. Skip reporting this CQE */
1032 		if (ret == EFAULT) {
1033 			cq_new_cqe = true;
1034 			continue;
1035 		}
1036 		goto error;
1037 	}
1038 
1039 	if (cq_new_cqe)
1040 		/* all previous CQ resizes are complete */
1041 		resized_bufs = irdma_process_resize_list(iwucq, NULL);
1042 	else if (last_buf)
1043 		/* only CQ resizes up to the last_buf are complete */
1044 		resized_bufs = irdma_process_resize_list(iwucq, last_buf);
1045 	if (resized_bufs)
1046 		/* report to the HW the number of complete CQ resizes */
1047 		irdma_uk_cq_set_resized_cnt(&iwucq->cq, resized_bufs);
1048 
1049 	return npolled;
1050 
1051 error:
1052 	printf("%s: Error polling CQ, irdma_err: %d\n", __func__, ret);
1053 
1054 	return EINVAL;
1055 }
1056 
1057 /**
1058  * irdma_upoll_cq - verb API callback to poll device CQ
1059  * @cq: ibv_cq to poll
1060  * @num_entries: max cq entries to poll
1061  * @entry: pointer to array of ibv_wc objects to be filled in for each completion
1062  *
1063  * Returns non-negative value equal to the number of completions
1064  * found and a negative error code on failure
1065  */
1066 int
1067 irdma_upoll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *entry)
1068 {
1069 	struct irdma_ucq *iwucq;
1070 	int ret;
1071 
1072 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1073 	ret = pthread_spin_lock(&iwucq->lock);
1074 	if (ret)
1075 		return -ret;
1076 
1077 	ret = __irdma_upoll_cq(iwucq, num_entries, entry);
1078 
1079 	pthread_spin_unlock(&iwucq->lock);
1080 
1081 	return ret;
1082 }
1083 
1084 /**
1085  * irdma_start_poll - verb_ex API callback to poll batch of WC's
1086  * @ibvcq_ex: ibv extended CQ
1087  * @attr: attributes (not used)
1088  *
1089  * Start polling batch of work completions. Return 0 on success, ENONENT when
1090  * no completions are available on CQ. And an error code on errors
1091  */
1092 static int
1093 irdma_start_poll(struct ibv_cq_ex *ibvcq_ex, struct ibv_poll_cq_attr *attr)
1094 {
1095 	struct irdma_ucq *iwucq;
1096 	int ret;
1097 
1098 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1099 	ret = pthread_spin_lock(&iwucq->lock);
1100 	if (ret)
1101 		return ret;
1102 
1103 	ret = __irdma_upoll_cq(iwucq, 1, NULL);
1104 	if (ret == 1)
1105 		return 0;
1106 
1107 	/* No Completions on CQ */
1108 	if (!ret)
1109 		ret = ENOENT;
1110 
1111 	pthread_spin_unlock(&iwucq->lock);
1112 
1113 	return ret;
1114 }
1115 
1116 /**
1117  * irdma_next_poll - verb_ex API callback to get next WC
1118  * @ibvcq_ex: ibv extended CQ
1119  *
1120  * Return 0 on success, ENONENT when no completions are available on CQ.
1121  * And an error code on errors
1122  */
1123 static int
1124 irdma_next_poll(struct ibv_cq_ex *ibvcq_ex)
1125 {
1126 	struct irdma_ucq *iwucq;
1127 	int ret;
1128 
1129 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1130 	ret = __irdma_upoll_cq(iwucq, 1, NULL);
1131 	if (ret == 1)
1132 		return 0;
1133 
1134 	/* No Completions on CQ */
1135 	if (!ret)
1136 		ret = ENOENT;
1137 
1138 	return ret;
1139 }
1140 
1141 /**
1142  * irdma_end_poll - verb_ex API callback to end polling of WC's
1143  * @ibvcq_ex: ibv extended CQ
1144  */
1145 static void
1146 irdma_end_poll(struct ibv_cq_ex *ibvcq_ex)
1147 {
1148 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1149 					       verbs_cq.cq_ex);
1150 
1151 	pthread_spin_unlock(&iwucq->lock);
1152 }
1153 
1154 static enum ibv_wc_opcode
1155 irdma_wc_read_opcode(struct ibv_cq_ex *ibvcq_ex)
1156 {
1157 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1158 					       verbs_cq.cq_ex);
1159 
1160 	switch (iwucq->cur_cqe.op_type) {
1161 	case IRDMA_OP_TYPE_RDMA_WRITE:
1162 	case IRDMA_OP_TYPE_RDMA_WRITE_SOL:
1163 		return IBV_WC_RDMA_WRITE;
1164 	case IRDMA_OP_TYPE_RDMA_READ:
1165 		return IBV_WC_RDMA_READ;
1166 	case IRDMA_OP_TYPE_SEND_SOL:
1167 	case IRDMA_OP_TYPE_SEND_SOL_INV:
1168 	case IRDMA_OP_TYPE_SEND_INV:
1169 	case IRDMA_OP_TYPE_SEND:
1170 		return IBV_WC_SEND;
1171 	case IRDMA_OP_TYPE_BIND_MW:
1172 		return IBV_WC_BIND_MW;
1173 	case IRDMA_OP_TYPE_REC:
1174 		return IBV_WC_RECV;
1175 	case IRDMA_OP_TYPE_REC_IMM:
1176 		return IBV_WC_RECV_RDMA_WITH_IMM;
1177 	case IRDMA_OP_TYPE_INV_STAG:
1178 		return IBV_WC_LOCAL_INV;
1179 	}
1180 
1181 	printf("%s: Invalid opcode = %d in CQE\n", __func__,
1182 	       iwucq->cur_cqe.op_type);
1183 
1184 	return 0;
1185 }
1186 
1187 static uint32_t irdma_wc_read_vendor_err(struct ibv_cq_ex *ibvcq_ex){
1188 	struct irdma_cq_poll_info *cur_cqe;
1189 	struct irdma_ucq *iwucq;
1190 
1191 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1192 	cur_cqe = &iwucq->cur_cqe;
1193 
1194 	return cur_cqe->error ? cur_cqe->major_err << 16 | cur_cqe->minor_err : 0;
1195 }
1196 
1197 static int
1198 irdma_wc_read_wc_flags(struct ibv_cq_ex *ibvcq_ex)
1199 {
1200 	struct irdma_cq_poll_info *cur_cqe;
1201 	struct irdma_ucq *iwucq;
1202 	struct irdma_qp_uk *qp;
1203 	struct ibv_qp *ib_qp;
1204 	int wc_flags = 0;
1205 
1206 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1207 	cur_cqe = &iwucq->cur_cqe;
1208 	qp = cur_cqe->qp_handle;
1209 	ib_qp = qp->back_qp;
1210 
1211 	if (cur_cqe->imm_valid)
1212 		wc_flags |= IBV_WC_WITH_IMM;
1213 
1214 	if (ib_qp->qp_type == IBV_QPT_UD) {
1215 		wc_flags |= IBV_WC_GRH;
1216 	} else {
1217 		if (cur_cqe->stag_invalid_set) {
1218 			switch (cur_cqe->op_type) {
1219 			case IRDMA_OP_TYPE_REC:
1220 				wc_flags |= IBV_WC_WITH_INV;
1221 				break;
1222 			case IRDMA_OP_TYPE_REC_IMM:
1223 				wc_flags |= IBV_WC_WITH_INV;
1224 				break;
1225 			}
1226 		}
1227 	}
1228 
1229 	return wc_flags;
1230 }
1231 
1232 static uint32_t irdma_wc_read_byte_len(struct ibv_cq_ex *ibvcq_ex){
1233 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1234 					       verbs_cq.cq_ex);
1235 
1236 	return iwucq->cur_cqe.bytes_xfered;
1237 }
1238 
1239 static __be32 irdma_wc_read_imm_data(struct ibv_cq_ex *ibvcq_ex){
1240 	struct irdma_cq_poll_info *cur_cqe;
1241 	struct irdma_ucq *iwucq;
1242 
1243 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1244 	cur_cqe = &iwucq->cur_cqe;
1245 
1246 	return cur_cqe->imm_valid ? htonl(cur_cqe->imm_data) : 0;
1247 }
1248 
1249 static uint32_t irdma_wc_read_qp_num(struct ibv_cq_ex *ibvcq_ex){
1250 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1251 					       verbs_cq.cq_ex);
1252 
1253 	return iwucq->cur_cqe.qp_id;
1254 }
1255 
1256 static uint32_t irdma_wc_read_src_qp(struct ibv_cq_ex *ibvcq_ex){
1257 	struct irdma_cq_poll_info *cur_cqe;
1258 	struct irdma_ucq *iwucq;
1259 	struct irdma_qp_uk *qp;
1260 	struct ibv_qp *ib_qp;
1261 
1262 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1263 	cur_cqe = &iwucq->cur_cqe;
1264 	qp = cur_cqe->qp_handle;
1265 	ib_qp = qp->back_qp;
1266 
1267 	return ib_qp->qp_type == IBV_QPT_UD ? cur_cqe->ud_src_qpn : cur_cqe->qp_id;
1268 }
1269 
/**
 * irdma_wc_read_sl - verb_ex API callback to read the WC service level
 * @ibvcq_ex: ibv extended CQ (not used)
 *
 * Always reports 0.
 */
static uint8_t
irdma_wc_read_sl(struct ibv_cq_ex *ibvcq_ex)
{
	return 0;
}
1273 
1274 void
1275 irdma_ibvcq_ex_fill_priv_funcs(struct irdma_ucq *iwucq,
1276 			       struct ibv_cq_init_attr_ex *attr_ex)
1277 {
1278 	struct ibv_cq_ex *ibvcq_ex = &iwucq->verbs_cq.cq_ex;
1279 
1280 	ibvcq_ex->start_poll = irdma_start_poll;
1281 	ibvcq_ex->end_poll = irdma_end_poll;
1282 	ibvcq_ex->next_poll = irdma_next_poll;
1283 
1284 	ibvcq_ex->read_opcode = irdma_wc_read_opcode;
1285 	ibvcq_ex->read_vendor_err = irdma_wc_read_vendor_err;
1286 	ibvcq_ex->read_wc_flags = irdma_wc_read_wc_flags;
1287 
1288 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
1289 		ibvcq_ex->read_byte_len = irdma_wc_read_byte_len;
1290 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_IMM)
1291 		ibvcq_ex->read_imm_data = irdma_wc_read_imm_data;
1292 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_QP_NUM)
1293 		ibvcq_ex->read_qp_num = irdma_wc_read_qp_num;
1294 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_SRC_QP)
1295 		ibvcq_ex->read_src_qp = irdma_wc_read_src_qp;
1296 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_SL)
1297 		ibvcq_ex->read_sl = irdma_wc_read_sl;
1298 }
1299 
1300 /**
1301  * irdma_arm_cq - arm of cq
1302  * @iwucq: cq to which arm
1303  * @cq_notify: notification params
1304  */
1305 static void
1306 irdma_arm_cq(struct irdma_ucq *iwucq,
1307 	     enum irdma_cmpl_notify cq_notify)
1308 {
1309 	iwucq->is_armed = true;
1310 	iwucq->arm_sol = true;
1311 	iwucq->skip_arm = false;
1312 	iwucq->skip_sol = true;
1313 	irdma_uk_cq_request_notification(&iwucq->cq, cq_notify);
1314 }
1315 
1316 /**
1317  * irdma_uarm_cq - callback for arm of cq
1318  * @cq: cq to arm
1319  * @solicited: to get notify params
1320  */
1321 int
1322 irdma_uarm_cq(struct ibv_cq *cq, int solicited)
1323 {
1324 	struct irdma_ucq *iwucq;
1325 	enum irdma_cmpl_notify cq_notify = IRDMA_CQ_COMPL_EVENT;
1326 	int ret;
1327 
1328 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1329 	if (solicited)
1330 		cq_notify = IRDMA_CQ_COMPL_SOLICITED;
1331 
1332 	ret = pthread_spin_lock(&iwucq->lock);
1333 	if (ret)
1334 		return ret;
1335 
1336 	if (iwucq->is_armed) {
1337 		if (iwucq->arm_sol && !solicited) {
1338 			irdma_arm_cq(iwucq, cq_notify);
1339 		} else {
1340 			iwucq->skip_arm = true;
1341 			iwucq->skip_sol = solicited ? true : false;
1342 		}
1343 	} else {
1344 		irdma_arm_cq(iwucq, cq_notify);
1345 	}
1346 
1347 	pthread_spin_unlock(&iwucq->lock);
1348 
1349 	return 0;
1350 }
1351 
1352 /**
1353  * irdma_cq_event - cq to do completion event
1354  * @cq: cq to arm
1355  */
1356 void
1357 irdma_cq_event(struct ibv_cq *cq)
1358 {
1359 	struct irdma_ucq *iwucq;
1360 
1361 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1362 	if (pthread_spin_lock(&iwucq->lock))
1363 		return;
1364 
1365 	if (iwucq->skip_arm)
1366 		irdma_arm_cq(iwucq, IRDMA_CQ_COMPL_EVENT);
1367 	else
1368 		iwucq->is_armed = false;
1369 
1370 	pthread_spin_unlock(&iwucq->lock);
1371 }
1372 
1373 void *
1374 irdma_mmap(int fd, off_t offset)
1375 {
1376 	void *map;
1377 
1378 	map = mmap(NULL, IRDMA_HW_PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED,
1379 		   fd, offset);
1380 	if (map == MAP_FAILED)
1381 		return map;
1382 
1383 	if (ibv_dontfork_range(map, IRDMA_HW_PAGE_SIZE)) {
1384 		munmap(map, IRDMA_HW_PAGE_SIZE);
1385 		return MAP_FAILED;
1386 	}
1387 
1388 	return map;
1389 }
1390 
1391 void
1392 irdma_munmap(void *map)
1393 {
1394 	ibv_dofork_range(map, IRDMA_HW_PAGE_SIZE);
1395 	munmap(map, IRDMA_HW_PAGE_SIZE);
1396 }
1397 
1398 /**
1399  * irdma_destroy_vmapped_qp - destroy resources for qp
1400  * @iwuqp: qp struct for resources
1401  */
1402 static int
1403 irdma_destroy_vmapped_qp(struct irdma_uqp *iwuqp)
1404 {
1405 	int ret;
1406 
1407 	ret = ibv_cmd_destroy_qp(&iwuqp->ibv_qp);
1408 	if (ret)
1409 		return ret;
1410 
1411 	if (iwuqp->qp.push_db)
1412 		irdma_munmap(iwuqp->qp.push_db);
1413 	if (iwuqp->qp.push_wqe)
1414 		irdma_munmap(iwuqp->qp.push_wqe);
1415 
1416 	ibv_cmd_dereg_mr(&iwuqp->vmr.ibv_mr);
1417 
1418 	return 0;
1419 }
1420 
/**
 * irdma_vmapped_qp - create resources for qp
 * @iwuqp: qp struct for resources
 * @pd: pd for the qp
 * @attr: attributes of qp passed
 * @info: uk info for initializing user level qp; sq, rq, shadow_area,
 *	  sq_size, rq_size, first_sq_wq, qp_caps and qp_id are filled in here
 * @legacy_mode: when true, first_sq_wq is forced to 1 instead of being
 *		 taken from the create-QP response
 *
 * Allocates one buffer holding the SQ, the RQ and the doorbell shadow area
 * (in that order), registers it as a memory region and then issues the
 * create-QP command. On failure everything acquired here is released
 * again.
 *
 * Returns 0 on success, ENOMEM if the buffer cannot be allocated, or the
 * error of the failing ibv_cmd_* call.
 */
static int
irdma_vmapped_qp(struct irdma_uqp *iwuqp, struct ibv_pd *pd,
		 struct ibv_qp_init_attr *attr,
		 struct irdma_qp_uk_init_info *info,
		 bool legacy_mode)
{
	struct irdma_ucreate_qp cmd = {};
	size_t sqsize, rqsize, totalqpsize;
	struct irdma_ucreate_qp_resp resp = {};
	struct irdma_ureg_mr reg_mr_cmd = {};
	struct ibv_reg_mr_resp reg_mr_resp = {};
	int ret;

	/* each queue is rounded up to a whole number of hardware pages */
	sqsize = roundup(info->sq_depth * IRDMA_QP_WQE_MIN_SIZE, IRDMA_HW_PAGE_SIZE);
	rqsize = roundup(info->rq_depth * IRDMA_QP_WQE_MIN_SIZE, IRDMA_HW_PAGE_SIZE);
	totalqpsize = rqsize + sqsize + IRDMA_DB_SHADOW_AREA_SIZE;
	info->sq = irdma_alloc_hw_buf(totalqpsize);
	iwuqp->buf_size = totalqpsize;

	if (!info->sq)
		return ENOMEM;

	memset(info->sq, 0, totalqpsize);
	/* RQ starts right after the SQ, shadow area right after the RQ */
	info->rq = &info->sq[sqsize / IRDMA_QP_WQE_MIN_SIZE];
	info->shadow_area = info->rq[rqsize / IRDMA_QP_WQE_MIN_SIZE].elem;

	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_QP;
	reg_mr_cmd.sq_pages = sqsize >> IRDMA_HW_PAGE_SHIFT;
	reg_mr_cmd.rq_pages = rqsize >> IRDMA_HW_PAGE_SHIFT;

	ret = ibv_cmd_reg_mr(pd, info->sq, totalqpsize,
			     (uintptr_t)info->sq, IBV_ACCESS_LOCAL_WRITE,
			     &iwuqp->vmr.ibv_mr, &reg_mr_cmd.ibv_cmd,
			     sizeof(reg_mr_cmd), &reg_mr_resp,
			     sizeof(reg_mr_resp));
	/* registration failed: only the buffer has to be released */
	if (ret)
		goto err_dereg_mr;

	cmd.user_wqe_bufs = (__u64) ((uintptr_t)info->sq);
	cmd.user_compl_ctx = (__u64) (uintptr_t)&iwuqp->qp;
	ret = ibv_cmd_create_qp(pd, &iwuqp->ibv_qp, attr, &cmd.ibv_cmd,
				sizeof(cmd), &resp.ibv_resp,
				sizeof(struct irdma_ucreate_qp_resp));
	/* create failed: deregister the MR, then release the buffer */
	if (ret)
		goto err_qp;

	/* copy what the create-QP response reported back into info / iwuqp */
	info->sq_size = resp.actual_sq_size;
	info->rq_size = resp.actual_rq_size;
	info->first_sq_wq = legacy_mode ? 1 : resp.lsmm;
	info->qp_caps = resp.qp_caps;
	info->qp_id = resp.qp_id;
	iwuqp->irdma_drv_opt = resp.irdma_drv_opt;
	iwuqp->ibv_qp.qp_num = resp.qp_id;

	/* link the QP and its send/recv CQs in both directions */
	iwuqp->send_cq = container_of(attr->send_cq, struct irdma_ucq,
				      verbs_cq.cq);
	iwuqp->recv_cq = container_of(attr->recv_cq, struct irdma_ucq,
				      verbs_cq.cq);
	iwuqp->send_cq->uqp = iwuqp;
	iwuqp->recv_cq->uqp = iwuqp;

	return 0;
err_qp:
	ibv_cmd_dereg_mr(&iwuqp->vmr.ibv_mr);
err_dereg_mr:
	printf("%s: failed to create QP, status %d\n", __func__, ret);
	irdma_free_hw_buf(info->sq, iwuqp->buf_size);
	return ret;
}
1499 
/**
 * irdma_ucreate_qp - create qp on user app
 * @pd: pd for the qp
 * @attr: attributes of the qp to be created (sizes, sge, cq); on success
 *	  cap.max_send_wr and cap.max_recv_wr are rewritten from the
 *	  computed queue depths
 *
 * Only RC and UD QPs are accepted. The requested SGE counts and inline
 * size are checked against the device limits, queue depths are computed,
 * the per-QP tracking arrays are allocated and the QP is created through
 * irdma_vmapped_qp() and irdma_uk_qp_init().
 *
 * Returns the new ibv_qp, or NULL on failure. errno is set explicitly on
 * the validation, depth-calculation, irdma_vmapped_qp() and
 * irdma_uk_qp_init() failure paths.
 */
struct ibv_qp *
irdma_ucreate_qp(struct ibv_pd *pd,
		 struct ibv_qp_init_attr *attr)
{
	struct irdma_qp_uk_init_info info = {};
	struct irdma_uk_attrs *uk_attrs;
	struct irdma_uvcontext *iwvctx;
	struct irdma_uqp *iwuqp;
	int status;

	if (attr->qp_type != IBV_QPT_RC && attr->qp_type != IBV_QPT_UD) {
		printf("%s: failed to create QP, unsupported QP type: 0x%x\n",
		       __func__, attr->qp_type);
		errno = EOPNOTSUPP;
		return NULL;
	}

	iwvctx = container_of(pd->context, struct irdma_uvcontext, ibv_ctx);
	uk_attrs = &iwvctx->uk_attrs;

	/* reject requests beyond the device limits before allocating anything */
	if (attr->cap.max_send_sge > uk_attrs->max_hw_wq_frags ||
	    attr->cap.max_recv_sge > uk_attrs->max_hw_wq_frags ||
	    attr->cap.max_inline_data > uk_attrs->max_hw_inline) {
		errno = EINVAL;
		return NULL;
	}

	info.uk_attrs = uk_attrs;
	info.sq_size = attr->cap.max_send_wr;
	info.rq_size = attr->cap.max_recv_wr;
	info.max_sq_frag_cnt = attr->cap.max_send_sge;
	info.max_rq_frag_cnt = attr->cap.max_recv_sge;
	info.max_inline_data = attr->cap.max_inline_data;
	info.abi_ver = iwvctx->abi_ver;

	status = irdma_uk_calc_depth_shift_sq(&info, &info.sq_depth, &info.sq_shift);
	if (status) {
		printf("%s: invalid SQ attributes, max_send_wr=%d max_send_sge=%d max_inline=%d\n",
		       __func__, attr->cap.max_send_wr, attr->cap.max_send_sge,
		       attr->cap.max_inline_data);
		errno = status;
		return NULL;
	}

	status = irdma_uk_calc_depth_shift_rq(&info, &info.rq_depth, &info.rq_shift);
	if (status) {
		printf("%s: invalid RQ attributes, recv_wr=%d recv_sge=%d\n",
		       __func__, attr->cap.max_recv_wr, attr->cap.max_recv_sge);
		errno = status;
		return NULL;
	}

	/* the QP structure is allocated 1024-byte aligned and zeroed */
	iwuqp = memalign(1024, sizeof(*iwuqp));
	if (!iwuqp)
		return NULL;

	memset(iwuqp, 0, sizeof(*iwuqp));

	if (pthread_spin_init(&iwuqp->lock, PTHREAD_PROCESS_PRIVATE))
		goto err_free_qp;

	info.sq_size = info.sq_depth >> info.sq_shift;
	info.rq_size = info.rq_depth >> info.rq_shift;
	/**
	 * For ABI versions 5 and older, the sizes computed above are passed
	 * on in cap.max_send_wr and cap.max_recv_wr instead of the values
	 * requested by the caller.
	 * (Original note: older ABI passes raw sq and rq quanta because the
	 * kernel had no way of calculating the actual qp size.)
	 */
	if (iwvctx->abi_ver <= 5) {
		attr->cap.max_send_wr = info.sq_size;
		attr->cap.max_recv_wr = info.rq_size;
	}

	/* scratch SGE array, one slot per receive SGE requested */
	iwuqp->recv_sges = calloc(attr->cap.max_recv_sge, sizeof(*iwuqp->recv_sges));
	if (!iwuqp->recv_sges)
		goto err_destroy_lock;

	info.wqe_alloc_db = (u32 *)iwvctx->db;
	info.legacy_mode = iwvctx->legacy_mode;
	info.sq_wrtrk_array = calloc(info.sq_depth, sizeof(*info.sq_wrtrk_array));
	if (!info.sq_wrtrk_array)
		goto err_free_rsges;

	info.rq_wrid_array = calloc(info.rq_depth, sizeof(*info.rq_wrid_array));
	if (!info.rq_wrid_array)
		goto err_free_sq_wrtrk;

	iwuqp->sq_sig_all = attr->sq_sig_all;
	iwuqp->qp_type = attr->qp_type;
	/* on failure irdma_vmapped_qp() has already released the queue buffer */
	status = irdma_vmapped_qp(iwuqp, pd, attr, &info, iwvctx->legacy_mode);
	if (status) {
		errno = status;
		goto err_free_rq_wrid;
	}

	iwuqp->qp.back_qp = iwuqp;
	iwuqp->qp.lock = &iwuqp->lock;

	status = irdma_uk_qp_init(&iwuqp->qp, &info);
	if (status) {
		errno = status;
		goto err_free_vmap_qp;
	}

	/* report the usable depths (reserved entries excluded) to the caller */
	attr->cap.max_send_wr = (info.sq_depth - IRDMA_SQ_RSVD) >> info.sq_shift;
	attr->cap.max_recv_wr = (info.rq_depth - IRDMA_RQ_RSVD) >> info.rq_shift;

	return &iwuqp->ibv_qp;

	/* unwind in reverse order of acquisition */
err_free_vmap_qp:
	irdma_destroy_vmapped_qp(iwuqp);
	irdma_free_hw_buf(info.sq, iwuqp->buf_size);
err_free_rq_wrid:
	free(info.rq_wrid_array);
err_free_sq_wrtrk:
	free(info.sq_wrtrk_array);
err_free_rsges:
	free(iwuqp->recv_sges);
err_destroy_lock:
	pthread_spin_destroy(&iwuqp->lock);
err_free_qp:
	printf("%s: failed to create QP\n", __func__);
	free(iwuqp);

	return NULL;
}
1631 
1632 /**
1633  * irdma_uquery_qp - query qp for some attribute
1634  * @qp: qp for the attributes query
1635  * @attr: to return the attributes
1636  * @attr_mask: mask of what is query for
1637  * @init_attr: initial attributes during create_qp
1638  */
1639 int
1640 irdma_uquery_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask,
1641 		struct ibv_qp_init_attr *init_attr)
1642 {
1643 	struct ibv_query_qp cmd;
1644 
1645 	return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd,
1646 				sizeof(cmd));
1647 }
1648 
1649 /**
1650  * irdma_umodify_qp - send qp modify to driver
1651  * @qp: qp to modify
1652  * @attr: attribute to modify
1653  * @attr_mask: mask of the attribute
1654  */
1655 int
1656 irdma_umodify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask)
1657 {
1658 	struct irdma_umodify_qp_resp resp = {};
1659 	struct ibv_modify_qp cmd = {};
1660 	struct irdma_modify_qp_cmd cmd_ex = {};
1661 	struct irdma_uvcontext *iwvctx;
1662 	struct irdma_uqp *iwuqp;
1663 
1664 	iwuqp = container_of(qp, struct irdma_uqp, ibv_qp);
1665 	iwvctx = container_of(qp->context, struct irdma_uvcontext, ibv_ctx);
1666 
1667 	if (iwuqp->qp.qp_caps & IRDMA_PUSH_MODE && attr_mask & IBV_QP_STATE &&
1668 	    iwvctx->uk_attrs.hw_rev > IRDMA_GEN_1) {
1669 		u64 offset;
1670 		void *map;
1671 		int ret;
1672 
1673 		ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex.ibv_cmd,
1674 					   sizeof(cmd_ex.ibv_cmd),
1675 					   sizeof(cmd_ex), &resp.ibv_resp,
1676 					   sizeof(resp.ibv_resp),
1677 					   sizeof(resp));
1678 		if (!ret)
1679 			iwuqp->qp.rd_fence_rate = resp.rd_fence_rate;
1680 		if (ret || !resp.push_valid)
1681 			return ret;
1682 
1683 		if (iwuqp->qp.push_wqe)
1684 			return ret;
1685 
1686 		offset = resp.push_wqe_mmap_key;
1687 		map = irdma_mmap(qp->context->cmd_fd, offset);
1688 		if (map == MAP_FAILED)
1689 			return ret;
1690 
1691 		iwuqp->qp.push_wqe = map;
1692 
1693 		offset = resp.push_db_mmap_key;
1694 		map = irdma_mmap(qp->context->cmd_fd, offset);
1695 		if (map == MAP_FAILED) {
1696 			irdma_munmap(iwuqp->qp.push_wqe);
1697 			iwuqp->qp.push_wqe = NULL;
1698 			printf("failed to map push page, errno %d\n", errno);
1699 			return ret;
1700 		}
1701 		iwuqp->qp.push_wqe += resp.push_offset;
1702 		iwuqp->qp.push_db = map + resp.push_offset;
1703 
1704 		return ret;
1705 	} else {
1706 		int ret;
1707 
1708 		ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd));
1709 		if (ret)
1710 			return ret;
1711 		if (attr_mask & IBV_QP_STATE && attr->qp_state == IBV_QPS_ERR)
1712 			pthread_create(&iwuqp->flush_thread, NULL, irdma_flush_thread, iwuqp);
1713 		return 0;
1714 	}
1715 }
1716 
1717 static void
1718 irdma_issue_flush(struct ibv_qp *qp, bool sq_flush, bool rq_flush)
1719 {
1720 	struct irdma_umodify_qp_resp resp = {};
1721 	struct irdma_modify_qp_cmd cmd_ex = {};
1722 	struct ibv_qp_attr attr = {};
1723 
1724 	attr.qp_state = IBV_QPS_ERR;
1725 	cmd_ex.sq_flush = sq_flush;
1726 	cmd_ex.rq_flush = rq_flush;
1727 
1728 	ibv_cmd_modify_qp_ex(qp, &attr, IBV_QP_STATE,
1729 			     &cmd_ex.ibv_cmd,
1730 			     sizeof(cmd_ex.ibv_cmd),
1731 			     sizeof(cmd_ex), &resp.ibv_resp,
1732 			     sizeof(resp.ibv_resp),
1733 			     sizeof(resp));
1734 }
1735 
1736 /**
1737  * irdma_clean_cqes - clean cq entries for qp
1738  * @qp: qp for which completions are cleaned
1739  * @iwcq: cq to be cleaned
1740  */
1741 static void
1742 irdma_clean_cqes(struct irdma_qp_uk *qp, struct irdma_ucq *iwucq)
1743 {
1744 	struct irdma_cq_uk *ukcq = &iwucq->cq;
1745 	int ret;
1746 
1747 	ret = pthread_spin_lock(&iwucq->lock);
1748 	if (ret)
1749 		return;
1750 
1751 	irdma_uk_clean_cq(qp, ukcq);
1752 	pthread_spin_unlock(&iwucq->lock);
1753 }
1754 
1755 /**
1756  * irdma_udestroy_qp - destroy qp
1757  * @qp: qp to destroy
1758  */
1759 int
1760 irdma_udestroy_qp(struct ibv_qp *qp)
1761 {
1762 	struct irdma_uqp *iwuqp;
1763 	int ret;
1764 
1765 	iwuqp = container_of(qp, struct irdma_uqp, ibv_qp);
1766 	if (iwuqp->flush_thread) {
1767 		pthread_cancel(iwuqp->flush_thread);
1768 		pthread_join(iwuqp->flush_thread, NULL);
1769 	}
1770 	ret = pthread_spin_destroy(&iwuqp->lock);
1771 	if (ret)
1772 		goto err;
1773 
1774 	ret = irdma_destroy_vmapped_qp(iwuqp);
1775 	if (ret)
1776 		goto err;
1777 
1778 	/* Clean any pending completions from the cq(s) */
1779 	if (iwuqp->send_cq)
1780 		irdma_clean_cqes(&iwuqp->qp, iwuqp->send_cq);
1781 
1782 	if (iwuqp->recv_cq && iwuqp->recv_cq != iwuqp->send_cq)
1783 		irdma_clean_cqes(&iwuqp->qp, iwuqp->recv_cq);
1784 
1785 	if (iwuqp->qp.sq_wrtrk_array)
1786 		free(iwuqp->qp.sq_wrtrk_array);
1787 	if (iwuqp->qp.rq_wrid_array)
1788 		free(iwuqp->qp.rq_wrid_array);
1789 
1790 	irdma_free_hw_buf(iwuqp->qp.sq_base, iwuqp->buf_size);
1791 	free(iwuqp->recv_sges);
1792 	free(iwuqp);
1793 	return 0;
1794 
1795 err:
1796 	printf("%s: failed to destroy QP, status %d\n",
1797 	       __func__, ret);
1798 	return ret;
1799 }
1800 
1801 /**
1802  * irdma_copy_sg_list - copy sg list for qp
1803  * @sg_list: copied into sg_list
1804  * @sgl: copy from sgl
1805  * @num_sges: count of sg entries
1806  * @max_sges: count of max supported sg entries
1807  */
1808 static void
1809 irdma_copy_sg_list(struct irdma_sge *sg_list, struct ibv_sge *sgl,
1810 		   int num_sges)
1811 {
1812 	int i;
1813 
1814 	for (i = 0; i < num_sges; i++) {
1815 		sg_list[i].tag_off = sgl[i].addr;
1816 		sg_list[i].len = sgl[i].length;
1817 		sg_list[i].stag = sgl[i].lkey;
1818 	}
1819 }
1820 
/**
 * calc_type2_mw_stag - calculate type 2 MW stag
 * @rkey: desired rkey of the MW
 * @mw_rkey: type2 memory window rkey
 *
 * The result takes its low 8 bits from @rkey and its upper 24 bits from
 * @mw_rkey.
 */
static inline u32 calc_type2_mw_stag(u32 rkey, u32 mw_rkey) {
	u32 key_bits = rkey & 0xffU;
	u32 index_bits = mw_rkey & 0xffffff00U;

	return index_bits | key_bits;
}
1834 
/**
 * irdma_upost_send -  post send wr for user application
 * @ib_qp: qp to post wr
 * @ib_wr: work request ptr (head of a linked list of work requests)
 * @bad_wr: set to the work request that failed, if any
 *
 * Walks the work request list under the QP spinlock and builds one WQE per
 * request. Processing stops at the first request that fails. Whatever was
 * built is then handed to irdma_uk_qp_post_wr().
 *
 * Returns 0 on success, the pthread_spin_lock() error if the lock cannot
 * be taken, or the error of the failing work request (in which case
 * @bad_wr points at it).
 */
int
irdma_upost_send(struct ibv_qp *ib_qp, struct ibv_send_wr *ib_wr,
		 struct ibv_send_wr **bad_wr)
{
	struct irdma_post_sq_info info;
	struct irdma_uvcontext *iwvctx;
	struct irdma_uk_attrs *uk_attrs;
	struct irdma_uqp *iwuqp;
	bool reflush = false;
	int err = 0;

	iwuqp = container_of(ib_qp, struct irdma_uqp, ibv_qp);
	iwvctx = container_of(ib_qp->context, struct irdma_uvcontext, ibv_ctx);
	uk_attrs = &iwvctx->uk_attrs;

	err = pthread_spin_lock(&iwuqp->lock);
	if (err)
		return err;

	/*
	 * A flush is re-issued after posting when the QP is in the error state
	 * and the SQ ring reported no outstanding work on entry.
	 */
	if (!IRDMA_RING_MORE_WORK(iwuqp->qp.sq_ring) &&
	    ib_qp->state == IBV_QPS_ERR)
		reflush = true;

	while (ib_wr) {
		memset(&info, 0, sizeof(info));
		info.wr_id = (u64)(ib_wr->wr_id);
		/* signaled either per request or for every WR of this QP */
		if ((ib_wr->send_flags & IBV_SEND_SIGNALED) ||
		    iwuqp->sq_sig_all)
			info.signaled = true;
		if (ib_wr->send_flags & IBV_SEND_FENCE)
			info.read_fence = true;

		switch (ib_wr->opcode) {
		case IBV_WR_SEND_WITH_IMM:
			/* immediate data is only accepted if the QP caps allow it */
			if (iwuqp->qp.qp_caps & IRDMA_SEND_WITH_IMM) {
				info.imm_data_valid = true;
				info.imm_data = ntohl(ib_wr->imm_data);
			} else {
				err = EINVAL;
				break;
			}
			/* fallthrough */
		case IBV_WR_SEND:
		case IBV_WR_SEND_WITH_INV:
			if (ib_wr->opcode == IBV_WR_SEND ||
			    ib_wr->opcode == IBV_WR_SEND_WITH_IMM) {
				if (ib_wr->send_flags & IBV_SEND_SOLICITED)
					info.op_type = IRDMA_OP_TYPE_SEND_SOL;
				else
					info.op_type = IRDMA_OP_TYPE_SEND;
			} else {
				/* send with invalidate: stag comes from imm_data */
				if (ib_wr->send_flags & IBV_SEND_SOLICITED)
					info.op_type = IRDMA_OP_TYPE_SEND_SOL_INV;
				else
					info.op_type = IRDMA_OP_TYPE_SEND_INV;
				info.stag_to_inv = ib_wr->imm_data;
			}
			info.op.send.num_sges = ib_wr->num_sge;
			/* the caller's ibv_sge array is used in place as irdma_sge */
			info.op.send.sg_list = (struct irdma_sge *)ib_wr->sg_list;
			if (ib_qp->qp_type == IBV_QPT_UD) {
				struct irdma_uah *ah = container_of(ib_wr->wr.ud.ah,
								    struct irdma_uah, ibv_ah);

				info.op.send.ah_id = ah->ah_id;
				info.op.send.qkey = ib_wr->wr.ud.remote_qkey;
				info.op.send.dest_qp = ib_wr->wr.ud.remote_qpn;
			}

			if (ib_wr->send_flags & IBV_SEND_INLINE)
				err = irdma_uk_inline_send(&iwuqp->qp, &info, false);
			else
				err = irdma_uk_send(&iwuqp->qp, &info, false);
			break;
		case IBV_WR_RDMA_WRITE_WITH_IMM:
			/* immediate data is only accepted if the QP caps allow it */
			if (iwuqp->qp.qp_caps & IRDMA_WRITE_WITH_IMM) {
				info.imm_data_valid = true;
				info.imm_data = ntohl(ib_wr->imm_data);
			} else {
				err = EINVAL;
				break;
			}
			/* fallthrough */
		case IBV_WR_RDMA_WRITE:
			if (ib_wr->send_flags & IBV_SEND_SOLICITED)
				info.op_type = IRDMA_OP_TYPE_RDMA_WRITE_SOL;
			else
				info.op_type = IRDMA_OP_TYPE_RDMA_WRITE;

			info.op.rdma_write.num_lo_sges = ib_wr->num_sge;
			info.op.rdma_write.lo_sg_list = (void *)ib_wr->sg_list;
			info.op.rdma_write.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr;
			info.op.rdma_write.rem_addr.stag = ib_wr->wr.rdma.rkey;
			if (ib_wr->send_flags & IBV_SEND_INLINE)
				err = irdma_uk_inline_rdma_write(&iwuqp->qp, &info, false);
			else
				err = irdma_uk_rdma_write(&iwuqp->qp, &info, false);
			break;
		case IBV_WR_RDMA_READ:
			/* reads have their own SGE limit */
			if (ib_wr->num_sge > uk_attrs->max_hw_read_sges) {
				err = EINVAL;
				break;
			}
			info.op_type = IRDMA_OP_TYPE_RDMA_READ;
			info.op.rdma_read.rem_addr.tag_off = ib_wr->wr.rdma.remote_addr;
			info.op.rdma_read.rem_addr.stag = ib_wr->wr.rdma.rkey;

			info.op.rdma_read.lo_sg_list = (void *)ib_wr->sg_list;
			info.op.rdma_read.num_lo_sges = ib_wr->num_sge;
			err = irdma_uk_rdma_read(&iwuqp->qp, &info, false, false);
			break;
		case IBV_WR_BIND_MW:
			/* memory window binds are rejected on anything but RC QPs */
			if (ib_qp->qp_type != IBV_QPT_RC) {
				err = EINVAL;
				break;
			}
			info.op_type = IRDMA_OP_TYPE_BIND_MW;
			info.op.bind_window.mr_stag = ib_wr->bind_mw.bind_info.mr->rkey;
			if (ib_wr->bind_mw.mw->type == IBV_MW_TYPE_1) {
				info.op.bind_window.mem_window_type_1 = true;
				info.op.bind_window.mw_stag = ib_wr->bind_mw.rkey;
			} else {
				struct verbs_mr *vmr = verbs_get_mr(ib_wr->bind_mw.bind_info.mr);

				/* non-type-1 windows are rejected on zero-based MRs */
				if (vmr->access & IBV_ACCESS_ZERO_BASED) {
					err = EINVAL;
					break;
				}
				/* the computed stag is also stored back into the MW */
				info.op.bind_window.mw_stag =
				    calc_type2_mw_stag(ib_wr->bind_mw.rkey, ib_wr->bind_mw.mw->rkey);
				ib_wr->bind_mw.mw->rkey = info.op.bind_window.mw_stag;

			}

			if (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_ZERO_BASED) {
				info.op.bind_window.addressing_type = IRDMA_ADDR_TYPE_ZERO_BASED;
				info.op.bind_window.va = NULL;
			} else {
				info.op.bind_window.addressing_type = IRDMA_ADDR_TYPE_VA_BASED;
				info.op.bind_window.va = (void *)(uintptr_t)ib_wr->bind_mw.bind_info.addr;
			}
			info.op.bind_window.bind_len = ib_wr->bind_mw.bind_info.length;
			info.op.bind_window.ena_reads =
			    (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_REMOTE_READ) ? 1 : 0;
			info.op.bind_window.ena_writes =
			    (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_REMOTE_WRITE) ? 1 : 0;

			err = irdma_uk_mw_bind(&iwuqp->qp, &info, false);
			break;
		case IBV_WR_LOCAL_INV:
			/* the stag to invalidate is carried in imm_data */
			info.op_type = IRDMA_OP_TYPE_INV_STAG;
			info.op.inv_local_stag.target_stag = ib_wr->imm_data;
			err = irdma_uk_stag_local_invalidate(&iwuqp->qp, &info, true);
			break;
		default:
			/* error */
			err = EINVAL;
			printf("%s: post work request failed, invalid opcode: 0x%x\n",
			       __func__, ib_wr->opcode);
			break;
		}
		/* stop at the first failing request; ib_wr still points at it */
		if (err)
			break;

		ib_wr = ib_wr->next;
	}

	if (err)
		*bad_wr = ib_wr;

	/* called on both the success and the error path */
	irdma_uk_qp_post_wr(&iwuqp->qp);
	if (reflush)
		irdma_issue_flush(ib_qp, 1, 0);

	pthread_spin_unlock(&iwuqp->lock);

	return err;
}
2018 
2019 /**
2020  * irdma_post_recv - post receive wr for user application
2021  * @ib_wr: work request for receive
2022  * @bad_wr: bad wr caused an error
2023  */
2024 int
2025 irdma_upost_recv(struct ibv_qp *ib_qp, struct ibv_recv_wr *ib_wr,
2026 		 struct ibv_recv_wr **bad_wr)
2027 {
2028 	struct irdma_post_rq_info post_recv = {};
2029 	struct irdma_sge *sg_list;
2030 	struct irdma_uqp *iwuqp;
2031 	bool reflush = false;
2032 	int err = 0;
2033 
2034 	iwuqp = container_of(ib_qp, struct irdma_uqp, ibv_qp);
2035 	sg_list = iwuqp->recv_sges;
2036 
2037 	err = pthread_spin_lock(&iwuqp->lock);
2038 	if (err)
2039 		return err;
2040 
2041 	if (!IRDMA_RING_MORE_WORK(iwuqp->qp.rq_ring) &&
2042 	    ib_qp->state == IBV_QPS_ERR)
2043 		reflush = true;
2044 
2045 	while (ib_wr) {
2046 		if (ib_wr->num_sge > iwuqp->qp.max_rq_frag_cnt) {
2047 			*bad_wr = ib_wr;
2048 			err = EINVAL;
2049 			goto error;
2050 		}
2051 		post_recv.num_sges = ib_wr->num_sge;
2052 		post_recv.wr_id = ib_wr->wr_id;
2053 		irdma_copy_sg_list(sg_list, ib_wr->sg_list, ib_wr->num_sge);
2054 		post_recv.sg_list = sg_list;
2055 		err = irdma_uk_post_receive(&iwuqp->qp, &post_recv);
2056 		if (err) {
2057 			*bad_wr = ib_wr;
2058 			goto error;
2059 		}
2060 
2061 		if (reflush)
2062 			irdma_issue_flush(ib_qp, 0, 1);
2063 
2064 		ib_wr = ib_wr->next;
2065 	}
2066 error:
2067 	pthread_spin_unlock(&iwuqp->lock);
2068 
2069 	return err;
2070 }
2071 
2072 /**
2073  * irdma_ucreate_ah - create address handle associated with a pd
2074  * @ibpd: pd for the address handle
2075  * @attr: attributes of address handle
2076  */
2077 struct ibv_ah *
2078 irdma_ucreate_ah(struct ibv_pd *ibpd, struct ibv_ah_attr *attr)
2079 {
2080 	struct irdma_uah *ah;
2081 	union ibv_gid sgid;
2082 	struct irdma_ucreate_ah_resp resp = {};
2083 	int err;
2084 
2085 	err = ibv_query_gid(ibpd->context, attr->port_num, attr->grh.sgid_index,
2086 			    &sgid);
2087 	if (err) {
2088 		fprintf(stderr, "irdma: Error from ibv_query_gid.\n");
2089 		errno = err;
2090 		return NULL;
2091 	}
2092 
2093 	ah = calloc(1, sizeof(*ah));
2094 	if (!ah)
2095 		return NULL;
2096 
2097 	err = ibv_cmd_create_ah(ibpd, &ah->ibv_ah, attr, &resp.ibv_resp,
2098 				sizeof(resp));
2099 	if (err) {
2100 		free(ah);
2101 		errno = err;
2102 		return NULL;
2103 	}
2104 
2105 	ah->ah_id = resp.ah_id;
2106 
2107 	return &ah->ibv_ah;
2108 }
2109 
2110 /**
2111  * irdma_udestroy_ah - destroy the address handle
2112  * @ibah: address handle
2113  */
2114 int
2115 irdma_udestroy_ah(struct ibv_ah *ibah)
2116 {
2117 	struct irdma_uah *ah;
2118 	int ret;
2119 
2120 	ah = container_of(ibah, struct irdma_uah, ibv_ah);
2121 
2122 	ret = ibv_cmd_destroy_ah(ibah);
2123 	if (ret)
2124 		return ret;
2125 
2126 	free(ah);
2127 
2128 	return 0;
2129 }
2130 
/**
 * irdma_uattach_mcast - Attach qp to multicast group
 * @qp: The queue pair
 * @gid: The Global ID for multicast group
 * @lid: The Local ID
 *
 * Forwards to ibv_cmd_attach_mcast() and returns its result.
 */
int
irdma_uattach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
		    uint16_t lid)
{
	int rc = ibv_cmd_attach_mcast(qp, gid, lid);

	return rc;
}
2143 
/**
 * irdma_udetach_mcast - Detach qp from multicast group
 * @qp: The queue pair
 * @gid: The Global ID for multicast group
 * @lid: The Local ID
 *
 * Forwards to ibv_cmd_detach_mcast() and returns its result.
 */
int
irdma_udetach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
		    uint16_t lid)
{
	int rc = ibv_cmd_detach_mcast(qp, gid, lid);

	return rc;
}
2156 
2157 /**
2158  * irdma_uresize_cq - resizes a cq
2159  * @cq: cq to resize
2160  * @cqe: the number of cqes of the new cq
2161  */
2162 int
2163 irdma_uresize_cq(struct ibv_cq *cq, int cqe)
2164 {
2165 	struct irdma_uvcontext *iwvctx;
2166 	struct irdma_uk_attrs *uk_attrs;
2167 	struct irdma_uresize_cq cmd = {};
2168 	struct ibv_resize_cq_resp resp = {};
2169 	struct irdma_ureg_mr reg_mr_cmd = {};
2170 	struct ibv_reg_mr_resp reg_mr_resp = {};
2171 	struct irdma_cq_buf *cq_buf = NULL;
2172 	struct irdma_cqe *cq_base = NULL;
2173 	struct verbs_mr new_mr = {};
2174 	struct irdma_ucq *iwucq;
2175 	size_t cq_size;
2176 	u32 cq_pages;
2177 	int cqe_needed;
2178 	int ret = 0;
2179 	bool cqe_64byte_ena;
2180 
2181 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
2182 	iwvctx = container_of(cq->context, struct irdma_uvcontext, ibv_ctx);
2183 	uk_attrs = &iwvctx->uk_attrs;
2184 
2185 	if (!(uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE))
2186 		return EOPNOTSUPP;
2187 
2188 	if (cqe < uk_attrs->min_hw_cq_size || cqe > uk_attrs->max_hw_cq_size - 1)
2189 		return EINVAL;
2190 
2191 	cqe_64byte_ena = uk_attrs->feature_flags & IRDMA_FEATURE_64_BYTE_CQE ? true : false;
2192 
2193 	cqe_needed = get_cq_size(cqe, uk_attrs->hw_rev, cqe_64byte_ena);
2194 
2195 	if (cqe_needed == iwucq->cq.cq_size)
2196 		return 0;
2197 
2198 	cq_size = get_cq_total_bytes(cqe_needed, cqe_64byte_ena);
2199 	cq_pages = cq_size >> IRDMA_HW_PAGE_SHIFT;
2200 	cq_base = irdma_alloc_hw_buf(cq_size);
2201 	if (!cq_base)
2202 		return ENOMEM;
2203 
2204 	memset(cq_base, 0, cq_size);
2205 
2206 	cq_buf = malloc(sizeof(*cq_buf));
2207 	if (!cq_buf) {
2208 		ret = ENOMEM;
2209 		goto err_buf;
2210 	}
2211 
2212 	new_mr.ibv_mr.pd = iwucq->vmr.ibv_mr.pd;
2213 	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
2214 	reg_mr_cmd.cq_pages = cq_pages;
2215 
2216 	ret = ibv_cmd_reg_mr(new_mr.ibv_mr.pd, cq_base, cq_size,
2217 			     (uintptr_t)cq_base, IBV_ACCESS_LOCAL_WRITE,
2218 			     &new_mr.ibv_mr, &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd),
2219 			     &reg_mr_resp, sizeof(reg_mr_resp));
2220 	if (ret)
2221 		goto err_dereg_mr;
2222 
2223 	ret = pthread_spin_lock(&iwucq->lock);
2224 	if (ret)
2225 		goto err_lock;
2226 
2227 	cmd.user_cq_buffer = (__u64) ((uintptr_t)cq_base);
2228 	ret = ibv_cmd_resize_cq(&iwucq->verbs_cq.cq, cqe_needed, &cmd.ibv_cmd,
2229 				sizeof(cmd), &resp, sizeof(resp));
2230 	if (ret)
2231 		goto err_resize;
2232 
2233 	memcpy(&cq_buf->cq, &iwucq->cq, sizeof(cq_buf->cq));
2234 	cq_buf->buf_size = cq_size;
2235 	cq_buf->vmr = iwucq->vmr;
2236 	iwucq->vmr = new_mr;
2237 	irdma_uk_cq_resize(&iwucq->cq, cq_base, cqe_needed);
2238 	iwucq->verbs_cq.cq.cqe = cqe;
2239 	LIST_INSERT_HEAD(&iwucq->resize_list, cq_buf, list);
2240 
2241 	pthread_spin_unlock(&iwucq->lock);
2242 
2243 	return ret;
2244 
2245 err_resize:
2246 	pthread_spin_unlock(&iwucq->lock);
2247 err_lock:
2248 	ibv_cmd_dereg_mr(&new_mr.ibv_mr);
2249 err_dereg_mr:
2250 	free(cq_buf);
2251 err_buf:
2252 	fprintf(stderr, "failed to resize CQ cq_id=%d ret=%d\n", iwucq->cq.cq_id, ret);
2253 	irdma_free_hw_buf(cq_base, cq_size);
2254 	return ret;
2255 }
2256