xref: /freebsd/contrib/ofed/libirdma/irdma_uverbs.c (revision 357378bbdedf24ce2b90e9bd831af4a9db3ec70a)
1 /*-
2  * SPDX-License-Identifier: GPL-2.0 or Linux-OpenIB
3  *
4  * Copyright (C) 2019 - 2023 Intel Corporation
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenFabrics.org BSD license below:
11  *
12  *   Redistribution and use in source and binary forms, with or
13  *   without modification, are permitted provided that the following
14  *   conditions are met:
15  *
16  *    - Redistributions of source code must retain the above
17  *	copyright notice, this list of conditions and the following
18  *	disclaimer.
19  *
20  *    - Redistributions in binary form must reproduce the above
21  *	copyright notice, this list of conditions and the following
22  *	disclaimer in the documentation and/or other materials
23  *	provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  */
34 
35 #include <config.h>
36 #include <stdlib.h>
37 #include <stdio.h>
38 #include <string.h>
39 #include <unistd.h>
40 #include <signal.h>
41 #include <errno.h>
42 #include <sys/param.h>
43 #include <sys/mman.h>
44 #include <netinet/in.h>
45 #include <sys/stat.h>
46 #include <fcntl.h>
47 #include <stdbool.h>
48 #include <infiniband/opcode.h>
49 
50 #include "irdma_umain.h"
51 #include "abi.h"
52 
/**
 * print_fw_ver - format a packed firmware version as "major.minor"
 * @fw_ver: packed version word; bits 32..47 hold major, bits 0..15 hold minor
 * @str: destination buffer
 * @len: size of @str in bytes
 **/
static inline void
print_fw_ver(uint64_t fw_ver, char *str, size_t len)
{
	uint16_t fw_minor = fw_ver & 0xffff;
	uint16_t fw_major = (fw_ver >> 32) & 0xffff;

	snprintf(str, len, "%d.%d", fw_major, fw_minor);
}
63 
64 /**
65  * irdma_uquery_device_ex - query device attributes including extended properties
66  * @context: user context for the device
67  * @input: extensible input struct for ibv_query_device_ex verb
68  * @attr: extended device attribute struct
69  * @attr_size: size of extended device attribute struct
70  **/
71 int
72 irdma_uquery_device_ex(struct ibv_context *context,
73 		       const struct ibv_query_device_ex_input *input,
74 		       struct ibv_device_attr_ex *attr, size_t attr_size)
75 {
76 	struct irdma_query_device_ex cmd = {};
77 	struct irdma_query_device_ex_resp resp = {};
78 	uint64_t fw_ver;
79 	int ret;
80 
81 	ret = ibv_cmd_query_device_ex(context, input, attr, attr_size, &fw_ver,
82 				      &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd),
83 				      &resp.ibv_resp, sizeof(resp.ibv_resp), sizeof(resp));
84 	if (ret)
85 		return ret;
86 
87 	print_fw_ver(fw_ver, attr->orig_attr.fw_ver, sizeof(attr->orig_attr.fw_ver));
88 
89 	return 0;
90 }
91 
92 /**
93  * irdma_uquery_device - call driver to query device for max resources
94  * @context: user context for the device
95  * @attr: where to save all the mx resources from the driver
96  **/
97 int
98 irdma_uquery_device(struct ibv_context *context, struct ibv_device_attr *attr)
99 {
100 	struct ibv_query_device cmd;
101 	uint64_t fw_ver;
102 	int ret;
103 
104 	ret = ibv_cmd_query_device(context, attr, &fw_ver, &cmd, sizeof(cmd));
105 	if (ret)
106 		return ret;
107 
108 	print_fw_ver(fw_ver, attr->fw_ver, sizeof(attr->fw_ver));
109 
110 	return 0;
111 }
112 
113 /**
114  * irdma_uquery_port - get port attributes (msg size, lnk, mtu...)
115  * @context: user context of the device
116  * @port: port for the attributes
117  * @attr: to return port attributes
118  **/
119 int
120 irdma_uquery_port(struct ibv_context *context, uint8_t port,
121 		  struct ibv_port_attr *attr)
122 {
123 	struct ibv_query_port cmd;
124 
125 	return ibv_cmd_query_port(context, port, attr, &cmd, sizeof(cmd));
126 }
127 
128 /**
129  * irdma_ualloc_pd - allocates protection domain and return pd ptr
130  * @context: user context of the device
131  **/
132 struct ibv_pd *
133 irdma_ualloc_pd(struct ibv_context *context)
134 {
135 	struct ibv_alloc_pd cmd;
136 	struct irdma_ualloc_pd_resp resp = {};
137 	struct irdma_upd *iwupd;
138 	int err;
139 
140 	iwupd = calloc(1, sizeof(*iwupd));
141 	if (!iwupd)
142 		return NULL;
143 
144 	err = ibv_cmd_alloc_pd(context, &iwupd->ibv_pd, &cmd, sizeof(cmd),
145 			       &resp.ibv_resp, sizeof(resp));
146 	if (err)
147 		goto err_free;
148 
149 	iwupd->pd_id = resp.pd_id;
150 
151 	return &iwupd->ibv_pd;
152 
153 err_free:
154 	free(iwupd);
155 
156 	errno = err;
157 	return NULL;
158 }
159 
160 /**
161  * irdma_ufree_pd - free pd resources
162  * @pd: pd to free resources
163  */
164 int
165 irdma_ufree_pd(struct ibv_pd *pd)
166 {
167 	struct irdma_upd *iwupd;
168 	int ret;
169 
170 	iwupd = container_of(pd, struct irdma_upd, ibv_pd);
171 	ret = ibv_cmd_dealloc_pd(pd);
172 	if (ret)
173 		return ret;
174 
175 	free(iwupd);
176 
177 	return 0;
178 }
179 
180 /**
181  * irdma_ureg_mr - register user memory region
182  * @pd: pd for the mr
183  * @addr: user address of the memory region
184  * @length: length of the memory
185  * @hca_va: hca_va
186  * @access: access allowed on this mr
187  */
188 struct ibv_mr *
189 irdma_ureg_mr(struct ibv_pd *pd, void *addr, size_t length,
190 	      int access)
191 {
192 	struct verbs_mr *vmr;
193 	struct irdma_ureg_mr cmd = {};
194 	struct ibv_reg_mr_resp resp;
195 	int err;
196 
197 	vmr = malloc(sizeof(*vmr));
198 	if (!vmr)
199 		return NULL;
200 
201 	cmd.reg_type = IRDMA_MEMREG_TYPE_MEM;
202 	err = ibv_cmd_reg_mr(pd, addr, length,
203 			     (uintptr_t)addr, access, &vmr->ibv_mr, &cmd.ibv_cmd,
204 			     sizeof(cmd), &resp, sizeof(resp));
205 	if (err) {
206 		free(vmr);
207 		errno = err;
208 		return NULL;
209 	}
210 
211 	return &vmr->ibv_mr;
212 }
213 
214 /*
215  * irdma_urereg_mr - re-register memory region @vmr: mr that was allocated @flags: bit mask to indicate which of the
216  * attr's of MR modified @pd: pd of the mr @addr: user address of the memory region @length: length of the memory
217  * @access: access allowed on this mr
218  */
219 int
220 irdma_urereg_mr(struct verbs_mr *vmr, int flags, struct ibv_pd *pd,
221 		void *addr, size_t length, int access)
222 {
223 	struct irdma_urereg_mr cmd = {};
224 	struct ibv_rereg_mr_resp resp;
225 
226 	cmd.reg_type = IRDMA_MEMREG_TYPE_MEM;
227 	return ibv_cmd_rereg_mr(&vmr->ibv_mr, flags, addr, length, (uintptr_t)addr,
228 				access, pd, &cmd.ibv_cmd, sizeof(cmd), &resp,
229 				sizeof(resp));
230 }
231 
232 /**
233  * irdma_udereg_mr - re-register memory region
234  * @mr: mr that was allocated
235  */
236 int
237 irdma_udereg_mr(struct ibv_mr *mr)
238 {
239 	struct verbs_mr *vmr;
240 	int ret;
241 
242 	vmr = container_of(mr, struct verbs_mr, ibv_mr);
243 
244 	ret = ibv_cmd_dereg_mr(mr);
245 	if (ret)
246 		return ret;
247 
248 	return 0;
249 }
250 
251 /**
252  * irdma_ualloc_mw - allocate memory window
253  * @pd: protection domain
254  * @type: memory window type
255  */
256 struct ibv_mw *
257 irdma_ualloc_mw(struct ibv_pd *pd, enum ibv_mw_type type)
258 {
259 	struct ibv_mw *mw;
260 	struct ibv_alloc_mw cmd;
261 	struct ibv_alloc_mw_resp resp;
262 	int err;
263 
264 	mw = calloc(1, sizeof(*mw));
265 	if (!mw)
266 		return NULL;
267 
268 	err = ibv_cmd_alloc_mw(pd, type, mw, &cmd, sizeof(cmd), &resp,
269 			       sizeof(resp));
270 	if (err) {
271 		printf("%s: Failed to alloc memory window\n",
272 		       __func__);
273 		free(mw);
274 		errno = err;
275 		return NULL;
276 	}
277 
278 	return mw;
279 }
280 
281 /**
282  * irdma_ubind_mw - bind a memory window
283  * @qp: qp to post WR
284  * @mw: memory window to bind
285  * @mw_bind: bind info
286  */
287 int
288 irdma_ubind_mw(struct ibv_qp *qp, struct ibv_mw *mw,
289 	       struct ibv_mw_bind *mw_bind)
290 {
291 	struct ibv_mw_bind_info *bind_info = &mw_bind->bind_info;
292 	struct verbs_mr *vmr;
293 
294 	struct ibv_send_wr wr = {};
295 	struct ibv_send_wr *bad_wr;
296 	int err;
297 
298 	if (!bind_info->mr && (bind_info->addr || bind_info->length))
299 		return EINVAL;
300 
301 	if (bind_info->mr) {
302 		vmr = verbs_get_mr(bind_info->mr);
303 		if (vmr->mr_type != IBV_MR_TYPE_MR)
304 			return ENOTSUP;
305 
306 		if (vmr->access & IBV_ACCESS_ZERO_BASED)
307 			return EINVAL;
308 
309 		if (mw->pd != bind_info->mr->pd)
310 			return EPERM;
311 	}
312 
313 	wr.opcode = IBV_WR_BIND_MW;
314 	wr.bind_mw.bind_info = mw_bind->bind_info;
315 	wr.bind_mw.mw = mw;
316 	wr.bind_mw.rkey = ibv_inc_rkey(mw->rkey);
317 
318 	wr.wr_id = mw_bind->wr_id;
319 	wr.send_flags = mw_bind->send_flags;
320 
321 	err = irdma_upost_send(qp, &wr, &bad_wr);
322 	if (!err)
323 		mw->rkey = wr.bind_mw.rkey;
324 
325 	return err;
326 }
327 
328 /**
329  * irdma_udealloc_mw - deallocate memory window
330  * @mw: memory window to dealloc
331  */
332 int
333 irdma_udealloc_mw(struct ibv_mw *mw)
334 {
335 	int ret;
336 	struct ibv_dealloc_mw cmd;
337 
338 	ret = ibv_cmd_dealloc_mw(mw, &cmd, sizeof(cmd));
339 	if (ret)
340 		return ret;
341 	free(mw);
342 
343 	return 0;
344 }
345 
346 static void *
347 irdma_alloc_hw_buf(size_t size)
348 {
349 	void *buf;
350 
351 	buf = memalign(IRDMA_HW_PAGE_SIZE, size);
352 
353 	if (!buf)
354 		return NULL;
355 	if (ibv_dontfork_range(buf, size)) {
356 		free(buf);
357 		return NULL;
358 	}
359 
360 	return buf;
361 }
362 
/**
 * irdma_free_hw_buf - release a buffer obtained from irdma_alloc_hw_buf
 * @buf: buffer to release
 * @size: size that was passed when the buffer was allocated
 *
 * Calls ibv_dofork_range() on the range before freeing it.
 */
static void
irdma_free_hw_buf(void *buf, size_t size)
{
	(void)ibv_dofork_range(buf, size);

	free(buf);
}
369 
370 /**
371  * get_cq_size - returns actual cqe needed by HW
372  * @ncqe: minimum cqes requested by application
373  * @hw_rev: HW generation
374  * @cqe_64byte_ena: enable 64byte cqe
375  */
376 static inline int
377 get_cq_size(int ncqe, u8 hw_rev)
378 {
379 	ncqe++;
380 
381 	/* Completions with immediate require 1 extra entry */
382 	if (hw_rev > IRDMA_GEN_1)
383 		ncqe *= 2;
384 
385 	if (ncqe < IRDMA_U_MINCQ_SIZE)
386 		ncqe = IRDMA_U_MINCQ_SIZE;
387 
388 	return ncqe;
389 }
390 
391 static inline size_t get_cq_total_bytes(u32 cq_size) {
392 	return roundup(cq_size * sizeof(struct irdma_cqe), IRDMA_HW_PAGE_SIZE);
393 }
394 
395 /**
396  * ucreate_cq - irdma util function to create a CQ
397  * @context: ibv context
398  * @attr_ex: CQ init attributes
399  * @ext_cq: flag to create an extendable or normal CQ
400  */
401 static struct ibv_cq_ex *
402 ucreate_cq(struct ibv_context *context,
403 	   struct ibv_cq_init_attr_ex *attr_ex,
404 	   bool ext_cq)
405 {
406 	struct irdma_cq_uk_init_info info = {};
407 	struct irdma_ureg_mr reg_mr_cmd = {};
408 	struct irdma_ucreate_cq_ex cmd = {};
409 	struct irdma_ucreate_cq_ex_resp resp = {};
410 	struct ibv_reg_mr_resp reg_mr_resp = {};
411 	struct irdma_ureg_mr reg_mr_shadow_cmd = {};
412 	struct ibv_reg_mr_resp reg_mr_shadow_resp = {};
413 	struct irdma_uk_attrs *uk_attrs;
414 	struct irdma_uvcontext *iwvctx;
415 	struct irdma_ucq *iwucq;
416 	size_t total_size;
417 	u32 cq_pages;
418 	int ret, ncqe;
419 	u8 hw_rev;
420 
421 	iwvctx = container_of(context, struct irdma_uvcontext, ibv_ctx);
422 	uk_attrs = &iwvctx->uk_attrs;
423 	hw_rev = uk_attrs->hw_rev;
424 
425 	if (ext_cq) {
426 		u32 supported_flags = IRDMA_STANDARD_WC_FLAGS_EX;
427 
428 		if (hw_rev == IRDMA_GEN_1 || attr_ex->wc_flags & ~supported_flags) {
429 			errno = EOPNOTSUPP;
430 			return NULL;
431 		}
432 	}
433 
434 	if (attr_ex->cqe < uk_attrs->min_hw_cq_size || attr_ex->cqe > uk_attrs->max_hw_cq_size - 1) {
435 		errno = EINVAL;
436 		return NULL;
437 	}
438 
439 	/* save the cqe requested by application */
440 	ncqe = attr_ex->cqe;
441 
442 	iwucq = calloc(1, sizeof(*iwucq));
443 	if (!iwucq)
444 		return NULL;
445 
446 	ret = pthread_spin_init(&iwucq->lock, PTHREAD_PROCESS_PRIVATE);
447 	if (ret) {
448 		free(iwucq);
449 		errno = ret;
450 		return NULL;
451 	}
452 
453 	info.cq_size = get_cq_size(attr_ex->cqe, hw_rev);
454 	total_size = get_cq_total_bytes(info.cq_size);
455 	iwucq->comp_vector = attr_ex->comp_vector;
456 	LIST_INIT(&iwucq->resize_list);
457 	cq_pages = total_size >> IRDMA_HW_PAGE_SHIFT;
458 
459 	if (!(uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE))
460 		total_size = (cq_pages << IRDMA_HW_PAGE_SHIFT) + IRDMA_DB_SHADOW_AREA_SIZE;
461 
462 	iwucq->buf_size = total_size;
463 	info.cq_base = irdma_alloc_hw_buf(total_size);
464 	if (!info.cq_base) {
465 		ret = ENOMEM;
466 		goto err_cq_base;
467 	}
468 
469 	memset(info.cq_base, 0, total_size);
470 	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
471 	reg_mr_cmd.cq_pages = cq_pages;
472 
473 	ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.cq_base,
474 			     total_size, (uintptr_t)info.cq_base,
475 			     IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr.ibv_mr,
476 			     &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd),
477 			     &reg_mr_resp, sizeof(reg_mr_resp));
478 	if (ret)
479 		goto err_dereg_mr;
480 
481 	iwucq->vmr.ibv_mr.pd = &iwvctx->iwupd->ibv_pd;
482 
483 	if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) {
484 		info.shadow_area = irdma_alloc_hw_buf(IRDMA_DB_SHADOW_AREA_SIZE);
485 		if (!info.shadow_area) {
486 			ret = ENOMEM;
487 			goto err_alloc_shadow;
488 		}
489 
490 		memset(info.shadow_area, 0, IRDMA_DB_SHADOW_AREA_SIZE);
491 		reg_mr_shadow_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
492 		reg_mr_shadow_cmd.cq_pages = 1;
493 
494 		ret = ibv_cmd_reg_mr(&iwvctx->iwupd->ibv_pd, info.shadow_area,
495 				     IRDMA_DB_SHADOW_AREA_SIZE, (uintptr_t)info.shadow_area,
496 				     IBV_ACCESS_LOCAL_WRITE, &iwucq->vmr_shadow_area.ibv_mr,
497 				     &reg_mr_shadow_cmd.ibv_cmd, sizeof(reg_mr_shadow_cmd),
498 				     &reg_mr_shadow_resp, sizeof(reg_mr_shadow_resp));
499 		if (ret) {
500 			irdma_free_hw_buf(info.shadow_area, IRDMA_DB_SHADOW_AREA_SIZE);
501 			goto err_alloc_shadow;
502 		}
503 
504 		iwucq->vmr_shadow_area.ibv_mr.pd = &iwvctx->iwupd->ibv_pd;
505 
506 	} else {
507 		info.shadow_area = (__le64 *) ((u8 *)info.cq_base + (cq_pages << IRDMA_HW_PAGE_SHIFT));
508 	}
509 
510 	attr_ex->cqe = info.cq_size;
511 	cmd.user_cq_buf = (__u64) ((uintptr_t)info.cq_base);
512 	cmd.user_shadow_area = (__u64) ((uintptr_t)info.shadow_area);
513 
514 	ret = ibv_cmd_create_cq_ex(context, attr_ex, &iwucq->verbs_cq.cq_ex,
515 				   &cmd.ibv_cmd, sizeof(cmd.ibv_cmd), sizeof(cmd), &resp.ibv_resp,
516 				   sizeof(resp.ibv_resp), sizeof(resp));
517 	attr_ex->cqe = ncqe;
518 	if (ret)
519 		goto err_create_cq;
520 
521 	if (ext_cq)
522 		irdma_ibvcq_ex_fill_priv_funcs(iwucq, attr_ex);
523 	info.cq_id = resp.cq_id;
524 	/* Do not report the CQE's reserved for immediate and burned by HW */
525 	iwucq->verbs_cq.cq.cqe = ncqe;
526 	info.cqe_alloc_db = (u32 *)((u8 *)iwvctx->db + IRDMA_DB_CQ_OFFSET);
527 	irdma_uk_cq_init(&iwucq->cq, &info);
528 	return &iwucq->verbs_cq.cq_ex;
529 
530 err_create_cq:
531 	if (iwucq->vmr_shadow_area.ibv_mr.handle) {
532 		ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area.ibv_mr);
533 		irdma_free_hw_buf(info.shadow_area, IRDMA_DB_SHADOW_AREA_SIZE);
534 	}
535 err_alloc_shadow:
536 	ibv_cmd_dereg_mr(&iwucq->vmr.ibv_mr);
537 err_dereg_mr:
538 	irdma_free_hw_buf(info.cq_base, total_size);
539 err_cq_base:
540 	printf("%s: failed to initialize CQ\n", __func__);
541 	pthread_spin_destroy(&iwucq->lock);
542 
543 	free(iwucq);
544 
545 	errno = ret;
546 	return NULL;
547 }
548 
549 struct ibv_cq *
550 irdma_ucreate_cq(struct ibv_context *context, int cqe,
551 		 struct ibv_comp_channel *channel,
552 		 int comp_vector)
553 {
554 	struct ibv_cq_init_attr_ex attr_ex = {
555 		.cqe = cqe,
556 		.channel = channel,
557 		.comp_vector = comp_vector,
558 	};
559 	struct ibv_cq_ex *ibvcq_ex;
560 
561 	ibvcq_ex = ucreate_cq(context, &attr_ex, false);
562 
563 	return ibvcq_ex ? ibv_cq_ex_to_cq(ibvcq_ex) : NULL;
564 }
565 
/**
 * irdma_ucreate_cq_ex - verb API callback to create an extended CQ
 * @context: ibv context
 * @attr_ex: CQ init attributes
 *
 * Returns the result of ucreate_cq() with the extended flag set.
 */
struct ibv_cq_ex *
irdma_ucreate_cq_ex(struct ibv_context *context,
		    struct ibv_cq_init_attr_ex *attr_ex)
{
	struct ibv_cq_ex *cq_ex = ucreate_cq(context, attr_ex, true);

	return cq_ex;
}
572 
573 /**
574  * irdma_free_cq_buf - free memory for cq buffer
575  * @cq_buf: cq buf to free
576  */
577 static void
578 irdma_free_cq_buf(struct irdma_cq_buf *cq_buf)
579 {
580 	ibv_cmd_dereg_mr(&cq_buf->vmr.ibv_mr);
581 	irdma_free_hw_buf(cq_buf->cq.cq_base, get_cq_total_bytes(cq_buf->cq.cq_size));
582 	free(cq_buf);
583 }
584 
585 /**
586  * irdma_process_resize_list - process the cq list to remove buffers
587  * @iwucq: cq which owns the list
588  * @lcqe_buf: cq buf where the last cqe is found
589  */
590 static int
591 irdma_process_resize_list(struct irdma_ucq *iwucq,
592 			  struct irdma_cq_buf *lcqe_buf)
593 {
594 	struct irdma_cq_buf *cq_buf, *next;
595 	int cq_cnt = 0;
596 
597 	LIST_FOREACH_SAFE(cq_buf, &iwucq->resize_list, list, next) {
598 		if (cq_buf == lcqe_buf)
599 			return cq_cnt;
600 
601 		LIST_REMOVE(cq_buf, list);
602 		irdma_free_cq_buf(cq_buf);
603 		cq_cnt++;
604 	}
605 
606 	return cq_cnt;
607 }
608 
609 /**
610  * irdma_udestroy_cq - destroys cq
611  * @cq: ptr to cq to be destroyed
612  */
613 int
614 irdma_udestroy_cq(struct ibv_cq *cq)
615 {
616 	struct irdma_uk_attrs *uk_attrs;
617 	struct irdma_uvcontext *iwvctx;
618 	struct irdma_ucq *iwucq;
619 	int ret;
620 
621 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
622 	iwvctx = container_of(cq->context, struct irdma_uvcontext, ibv_ctx);
623 	uk_attrs = &iwvctx->uk_attrs;
624 
625 	ret = pthread_spin_destroy(&iwucq->lock);
626 	if (ret)
627 		goto err;
628 
629 	irdma_process_resize_list(iwucq, NULL);
630 	ret = ibv_cmd_destroy_cq(cq);
631 	if (ret)
632 		goto err;
633 
634 	ibv_cmd_dereg_mr(&iwucq->vmr.ibv_mr);
635 	irdma_free_hw_buf(iwucq->cq.cq_base, iwucq->buf_size);
636 
637 	if (uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE) {
638 		ibv_cmd_dereg_mr(&iwucq->vmr_shadow_area.ibv_mr);
639 		irdma_free_hw_buf(iwucq->cq.shadow_area, IRDMA_DB_SHADOW_AREA_SIZE);
640 	}
641 	free(iwucq);
642 	return 0;
643 
644 err:
645 	return ret;
646 }
647 
648 static enum ibv_wc_status
649 irdma_flush_err_to_ib_wc_status(enum irdma_flush_opcode opcode)
650 {
651 	switch (opcode) {
652 	case FLUSH_PROT_ERR:
653 		return IBV_WC_LOC_PROT_ERR;
654 	case FLUSH_REM_ACCESS_ERR:
655 		return IBV_WC_REM_ACCESS_ERR;
656 	case FLUSH_LOC_QP_OP_ERR:
657 		return IBV_WC_LOC_QP_OP_ERR;
658 	case FLUSH_REM_OP_ERR:
659 		return IBV_WC_REM_OP_ERR;
660 	case FLUSH_LOC_LEN_ERR:
661 		return IBV_WC_LOC_LEN_ERR;
662 	case FLUSH_GENERAL_ERR:
663 		return IBV_WC_WR_FLUSH_ERR;
664 	case FLUSH_MW_BIND_ERR:
665 		return IBV_WC_MW_BIND_ERR;
666 	case FLUSH_REM_INV_REQ_ERR:
667 		return IBV_WC_REM_INV_REQ_ERR;
668 	case FLUSH_RETRY_EXC_ERR:
669 		return IBV_WC_RETRY_EXC_ERR;
670 	case FLUSH_FATAL_ERR:
671 	default:
672 		return IBV_WC_FATAL_ERR;
673 	}
674 }
675 
676 static inline void
677 set_ib_wc_op_sq(struct irdma_cq_poll_info *cur_cqe, struct ibv_wc *entry)
678 {
679 	switch (cur_cqe->op_type) {
680 	case IRDMA_OP_TYPE_RDMA_WRITE:
681 	case IRDMA_OP_TYPE_RDMA_WRITE_SOL:
682 		entry->opcode = IBV_WC_RDMA_WRITE;
683 		break;
684 	case IRDMA_OP_TYPE_RDMA_READ:
685 		entry->opcode = IBV_WC_RDMA_READ;
686 		break;
687 	case IRDMA_OP_TYPE_SEND_SOL:
688 	case IRDMA_OP_TYPE_SEND_SOL_INV:
689 	case IRDMA_OP_TYPE_SEND_INV:
690 	case IRDMA_OP_TYPE_SEND:
691 		entry->opcode = IBV_WC_SEND;
692 		break;
693 	case IRDMA_OP_TYPE_BIND_MW:
694 		entry->opcode = IBV_WC_BIND_MW;
695 		break;
696 	case IRDMA_OP_TYPE_INV_STAG:
697 		entry->opcode = IBV_WC_LOCAL_INV;
698 		break;
699 	default:
700 		entry->status = IBV_WC_GENERAL_ERR;
701 		printf("%s: Invalid opcode = %d in CQE\n",
702 		       __func__, cur_cqe->op_type);
703 	}
704 }
705 
706 static inline void
707 set_ib_wc_op_rq(struct irdma_cq_poll_info *cur_cqe,
708 		struct ibv_wc *entry, bool send_imm_support)
709 {
710 	if (!send_imm_support) {
711 		entry->opcode = cur_cqe->imm_valid ? IBV_WC_RECV_RDMA_WITH_IMM :
712 		    IBV_WC_RECV;
713 		return;
714 	}
715 	switch (cur_cqe->op_type) {
716 	case IBV_OPCODE_RDMA_WRITE_ONLY_WITH_IMMEDIATE:
717 	case IBV_OPCODE_RDMA_WRITE_LAST_WITH_IMMEDIATE:
718 		entry->opcode = IBV_WC_RECV_RDMA_WITH_IMM;
719 		break;
720 	default:
721 		entry->opcode = IBV_WC_RECV;
722 	}
723 }
724 
725 /**
726  * irdma_process_cqe_ext - process current cqe for extended CQ
727  * @cur_cqe - current cqe info
728  */
729 static void
730 irdma_process_cqe_ext(struct irdma_cq_poll_info *cur_cqe)
731 {
732 	struct irdma_ucq *iwucq = container_of(cur_cqe, struct irdma_ucq, cur_cqe);
733 	struct ibv_cq_ex *ibvcq_ex = &iwucq->verbs_cq.cq_ex;
734 
735 	ibvcq_ex->wr_id = cur_cqe->wr_id;
736 	if (cur_cqe->error)
737 		ibvcq_ex->status = (cur_cqe->comp_status == IRDMA_COMPL_STATUS_FLUSHED) ?
738 		    irdma_flush_err_to_ib_wc_status(cur_cqe->minor_err) : IBV_WC_GENERAL_ERR;
739 	else
740 		ibvcq_ex->status = IBV_WC_SUCCESS;
741 }
742 
743 /**
744  * irdma_process_cqe - process current cqe info
745  * @entry - ibv_wc object to fill in for non-extended CQ
746  * @cur_cqe - current cqe info
747  */
748 static void
749 irdma_process_cqe(struct ibv_wc *entry, struct irdma_cq_poll_info *cur_cqe)
750 {
751 	struct irdma_qp_uk *qp;
752 	struct ibv_qp *ib_qp;
753 
754 	entry->wc_flags = 0;
755 	entry->wr_id = cur_cqe->wr_id;
756 	entry->qp_num = cur_cqe->qp_id;
757 	qp = cur_cqe->qp_handle;
758 	ib_qp = qp->back_qp;
759 
760 	if (cur_cqe->error) {
761 		entry->status = (cur_cqe->comp_status == IRDMA_COMPL_STATUS_FLUSHED) ?
762 		    irdma_flush_err_to_ib_wc_status(cur_cqe->minor_err) : IBV_WC_GENERAL_ERR;
763 		entry->vendor_err = cur_cqe->major_err << 16 |
764 		    cur_cqe->minor_err;
765 	} else {
766 		entry->status = IBV_WC_SUCCESS;
767 	}
768 
769 	if (cur_cqe->imm_valid) {
770 		entry->imm_data = htonl(cur_cqe->imm_data);
771 		entry->wc_flags |= IBV_WC_WITH_IMM;
772 	}
773 
774 	if (cur_cqe->q_type == IRDMA_CQE_QTYPE_SQ) {
775 		set_ib_wc_op_sq(cur_cqe, entry);
776 	} else {
777 		set_ib_wc_op_rq(cur_cqe, entry,
778 				qp->qp_caps & IRDMA_SEND_WITH_IMM ?
779 				true : false);
780 		if (ib_qp->qp_type != IBV_QPT_UD &&
781 		    cur_cqe->stag_invalid_set) {
782 			entry->invalidated_rkey = cur_cqe->inv_stag;
783 			entry->wc_flags |= IBV_WC_WITH_INV;
784 		}
785 	}
786 
787 	if (ib_qp->qp_type == IBV_QPT_UD) {
788 		entry->src_qp = cur_cqe->ud_src_qpn;
789 		entry->wc_flags |= IBV_WC_GRH;
790 	} else {
791 		entry->src_qp = cur_cqe->qp_id;
792 	}
793 	entry->byte_len = cur_cqe->bytes_xfered;
794 }
795 
/**
 * irdma_poll_one - poll one entry of the CQ
 * @ukcq: ukcq to poll
 * @cur_cqe: current CQE info to be filled in
 * @entry: ibv_wc object to be filled for non-extended CQ or NULL for extended CQ
 *
 * Returns the internal irdma device error code or 0 on success
 */
static int
irdma_poll_one(struct irdma_cq_uk *ukcq, struct irdma_cq_poll_info *cur_cqe,
	       struct ibv_wc *entry)
{
	int rc;

	rc = irdma_uk_cq_poll_cmpl(ukcq, cur_cqe);
	if (rc != 0)
		return rc;

	/* a NULL entry selects the extended-CQ path */
	if (entry != NULL)
		irdma_process_cqe(entry, cur_cqe);
	else
		irdma_process_cqe_ext(cur_cqe);

	return 0;
}
820 
821 /**
822  * __irdma_upoll_cq - irdma util function to poll device CQ
823  * @iwucq: irdma cq to poll
824  * @num_entries: max cq entries to poll
825  * @entry: pointer to array of ibv_wc objects to be filled in for each completion or NULL if ext CQ
826  *
827  * Returns non-negative value equal to the number of completions
828  * found. On failure, EINVAL
829  */
830 static int
831 __irdma_upoll_cq(struct irdma_ucq *iwucq, int num_entries,
832 		 struct ibv_wc *entry)
833 {
834 	struct irdma_cq_buf *cq_buf, *next;
835 	struct irdma_cq_buf *last_buf = NULL;
836 	struct irdma_cq_poll_info *cur_cqe = &iwucq->cur_cqe;
837 	bool cq_new_cqe = false;
838 	int resized_bufs = 0;
839 	int npolled = 0;
840 	int ret;
841 
842 	/* go through the list of previously resized CQ buffers */
843 	LIST_FOREACH_SAFE(cq_buf, &iwucq->resize_list, list, next) {
844 		while (npolled < num_entries) {
845 			ret = irdma_poll_one(&cq_buf->cq, cur_cqe,
846 					     entry ? entry + npolled : NULL);
847 			if (!ret) {
848 				++npolled;
849 				cq_new_cqe = true;
850 				continue;
851 			}
852 			if (ret == ENOENT)
853 				break;
854 			/* QP using the CQ is destroyed. Skip reporting this CQE */
855 			if (ret == EFAULT) {
856 				cq_new_cqe = true;
857 				continue;
858 			}
859 			goto error;
860 		}
861 
862 		/* save the resized CQ buffer which received the last cqe */
863 		if (cq_new_cqe)
864 			last_buf = cq_buf;
865 		cq_new_cqe = false;
866 	}
867 
868 	/* check the current CQ for new cqes */
869 	while (npolled < num_entries) {
870 		ret = irdma_poll_one(&iwucq->cq, cur_cqe,
871 				     entry ? entry + npolled : NULL);
872 		if (!ret) {
873 			++npolled;
874 			cq_new_cqe = true;
875 			continue;
876 		}
877 		if (ret == ENOENT)
878 			break;
879 		/* QP using the CQ is destroyed. Skip reporting this CQE */
880 		if (ret == EFAULT) {
881 			cq_new_cqe = true;
882 			continue;
883 		}
884 		goto error;
885 	}
886 
887 	if (cq_new_cqe)
888 		/* all previous CQ resizes are complete */
889 		resized_bufs = irdma_process_resize_list(iwucq, NULL);
890 	else if (last_buf)
891 		/* only CQ resizes up to the last_buf are complete */
892 		resized_bufs = irdma_process_resize_list(iwucq, last_buf);
893 	if (resized_bufs)
894 		/* report to the HW the number of complete CQ resizes */
895 		irdma_uk_cq_set_resized_cnt(&iwucq->cq, resized_bufs);
896 
897 	return npolled;
898 
899 error:
900 	printf("%s: Error polling CQ, irdma_err: %d\n", __func__, ret);
901 
902 	return EINVAL;
903 }
904 
905 /**
906  * irdma_upoll_cq - verb API callback to poll device CQ
907  * @cq: ibv_cq to poll
908  * @num_entries: max cq entries to poll
909  * @entry: pointer to array of ibv_wc objects to be filled in for each completion
910  *
911  * Returns non-negative value equal to the number of completions
912  * found and a negative error code on failure
913  */
914 int
915 irdma_upoll_cq(struct ibv_cq *cq, int num_entries, struct ibv_wc *entry)
916 {
917 	struct irdma_ucq *iwucq;
918 	int ret;
919 
920 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
921 	ret = pthread_spin_lock(&iwucq->lock);
922 	if (ret)
923 		return -ret;
924 
925 	ret = __irdma_upoll_cq(iwucq, num_entries, entry);
926 
927 	pthread_spin_unlock(&iwucq->lock);
928 
929 	return ret;
930 }
931 
932 /**
933  * irdma_start_poll - verb_ex API callback to poll batch of WC's
934  * @ibvcq_ex: ibv extended CQ
935  * @attr: attributes (not used)
936  *
937  * Start polling batch of work completions. Return 0 on success, ENONENT when
938  * no completions are available on CQ. And an error code on errors
939  */
940 static int
941 irdma_start_poll(struct ibv_cq_ex *ibvcq_ex, struct ibv_poll_cq_attr *attr)
942 {
943 	struct irdma_ucq *iwucq;
944 	int ret;
945 
946 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
947 	ret = pthread_spin_lock(&iwucq->lock);
948 	if (ret)
949 		return ret;
950 
951 	ret = __irdma_upoll_cq(iwucq, 1, NULL);
952 	if (ret == 1)
953 		return 0;
954 
955 	/* No Completions on CQ */
956 	if (!ret)
957 		ret = ENOENT;
958 
959 	pthread_spin_unlock(&iwucq->lock);
960 
961 	return ret;
962 }
963 
964 /**
965  * irdma_next_poll - verb_ex API callback to get next WC
966  * @ibvcq_ex: ibv extended CQ
967  *
968  * Return 0 on success, ENONENT when no completions are available on CQ.
969  * And an error code on errors
970  */
971 static int
972 irdma_next_poll(struct ibv_cq_ex *ibvcq_ex)
973 {
974 	struct irdma_ucq *iwucq;
975 	int ret;
976 
977 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
978 	ret = __irdma_upoll_cq(iwucq, 1, NULL);
979 	if (ret == 1)
980 		return 0;
981 
982 	/* No Completions on CQ */
983 	if (!ret)
984 		ret = ENOENT;
985 
986 	return ret;
987 }
988 
989 /**
990  * irdma_end_poll - verb_ex API callback to end polling of WC's
991  * @ibvcq_ex: ibv extended CQ
992  */
993 static void
994 irdma_end_poll(struct ibv_cq_ex *ibvcq_ex)
995 {
996 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
997 					       verbs_cq.cq_ex);
998 
999 	pthread_spin_unlock(&iwucq->lock);
1000 }
1001 
1002 static enum ibv_wc_opcode
1003 irdma_wc_read_opcode(struct ibv_cq_ex *ibvcq_ex)
1004 {
1005 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1006 					       verbs_cq.cq_ex);
1007 
1008 	switch (iwucq->cur_cqe.op_type) {
1009 	case IRDMA_OP_TYPE_RDMA_WRITE:
1010 	case IRDMA_OP_TYPE_RDMA_WRITE_SOL:
1011 		return IBV_WC_RDMA_WRITE;
1012 	case IRDMA_OP_TYPE_RDMA_READ:
1013 		return IBV_WC_RDMA_READ;
1014 	case IRDMA_OP_TYPE_SEND_SOL:
1015 	case IRDMA_OP_TYPE_SEND_SOL_INV:
1016 	case IRDMA_OP_TYPE_SEND_INV:
1017 	case IRDMA_OP_TYPE_SEND:
1018 		return IBV_WC_SEND;
1019 	case IRDMA_OP_TYPE_BIND_MW:
1020 		return IBV_WC_BIND_MW;
1021 	case IRDMA_OP_TYPE_REC:
1022 		return IBV_WC_RECV;
1023 	case IRDMA_OP_TYPE_REC_IMM:
1024 		return IBV_WC_RECV_RDMA_WITH_IMM;
1025 	case IRDMA_OP_TYPE_INV_STAG:
1026 		return IBV_WC_LOCAL_INV;
1027 	}
1028 
1029 	printf("%s: Invalid opcode = %d in CQE\n", __func__,
1030 	       iwucq->cur_cqe.op_type);
1031 
1032 	return 0;
1033 }
1034 
1035 static uint32_t irdma_wc_read_vendor_err(struct ibv_cq_ex *ibvcq_ex){
1036 	struct irdma_cq_poll_info *cur_cqe;
1037 	struct irdma_ucq *iwucq;
1038 
1039 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1040 	cur_cqe = &iwucq->cur_cqe;
1041 
1042 	return cur_cqe->error ? cur_cqe->major_err << 16 | cur_cqe->minor_err : 0;
1043 }
1044 
1045 static int
1046 irdma_wc_read_wc_flags(struct ibv_cq_ex *ibvcq_ex)
1047 {
1048 	struct irdma_cq_poll_info *cur_cqe;
1049 	struct irdma_ucq *iwucq;
1050 	struct irdma_qp_uk *qp;
1051 	struct ibv_qp *ib_qp;
1052 	int wc_flags = 0;
1053 
1054 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1055 	cur_cqe = &iwucq->cur_cqe;
1056 	qp = cur_cqe->qp_handle;
1057 	ib_qp = qp->back_qp;
1058 
1059 	if (cur_cqe->imm_valid)
1060 		wc_flags |= IBV_WC_WITH_IMM;
1061 
1062 	if (ib_qp->qp_type == IBV_QPT_UD) {
1063 		wc_flags |= IBV_WC_GRH;
1064 	} else {
1065 		if (cur_cqe->stag_invalid_set) {
1066 			switch (cur_cqe->op_type) {
1067 			case IRDMA_OP_TYPE_REC:
1068 				wc_flags |= IBV_WC_WITH_INV;
1069 				break;
1070 			case IRDMA_OP_TYPE_REC_IMM:
1071 				wc_flags |= IBV_WC_WITH_INV;
1072 				break;
1073 			}
1074 		}
1075 	}
1076 
1077 	return wc_flags;
1078 }
1079 
1080 static uint32_t irdma_wc_read_byte_len(struct ibv_cq_ex *ibvcq_ex){
1081 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1082 					       verbs_cq.cq_ex);
1083 
1084 	return iwucq->cur_cqe.bytes_xfered;
1085 }
1086 
1087 static __be32 irdma_wc_read_imm_data(struct ibv_cq_ex *ibvcq_ex){
1088 	struct irdma_cq_poll_info *cur_cqe;
1089 	struct irdma_ucq *iwucq;
1090 
1091 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1092 	cur_cqe = &iwucq->cur_cqe;
1093 
1094 	return cur_cqe->imm_valid ? htonl(cur_cqe->imm_data) : 0;
1095 }
1096 
1097 static uint32_t irdma_wc_read_qp_num(struct ibv_cq_ex *ibvcq_ex){
1098 	struct irdma_ucq *iwucq = container_of(ibvcq_ex, struct irdma_ucq,
1099 					       verbs_cq.cq_ex);
1100 
1101 	return iwucq->cur_cqe.qp_id;
1102 }
1103 
1104 static uint32_t irdma_wc_read_src_qp(struct ibv_cq_ex *ibvcq_ex){
1105 	struct irdma_cq_poll_info *cur_cqe;
1106 	struct irdma_ucq *iwucq;
1107 	struct irdma_qp_uk *qp;
1108 	struct ibv_qp *ib_qp;
1109 
1110 	iwucq = container_of(ibvcq_ex, struct irdma_ucq, verbs_cq.cq_ex);
1111 	cur_cqe = &iwucq->cur_cqe;
1112 	qp = cur_cqe->qp_handle;
1113 	ib_qp = qp->back_qp;
1114 
1115 	return ib_qp->qp_type == IBV_QPT_UD ? cur_cqe->ud_src_qpn : cur_cqe->qp_id;
1116 }
1117 
/**
 * irdma_wc_read_sl - read the service level of the current completion
 * @ibvcq_ex: extended cq being polled (not inspected)
 *
 * Always reports 0.
 */
static uint8_t
irdma_wc_read_sl(struct ibv_cq_ex *ibvcq_ex)
{
	const uint8_t sl = 0;

	return sl;
}
1121 
1122 void
1123 irdma_ibvcq_ex_fill_priv_funcs(struct irdma_ucq *iwucq,
1124 			       struct ibv_cq_init_attr_ex *attr_ex)
1125 {
1126 	struct ibv_cq_ex *ibvcq_ex = &iwucq->verbs_cq.cq_ex;
1127 
1128 	ibvcq_ex->start_poll = irdma_start_poll;
1129 	ibvcq_ex->end_poll = irdma_end_poll;
1130 	ibvcq_ex->next_poll = irdma_next_poll;
1131 
1132 	ibvcq_ex->read_opcode = irdma_wc_read_opcode;
1133 	ibvcq_ex->read_vendor_err = irdma_wc_read_vendor_err;
1134 	ibvcq_ex->read_wc_flags = irdma_wc_read_wc_flags;
1135 
1136 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
1137 		ibvcq_ex->read_byte_len = irdma_wc_read_byte_len;
1138 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_IMM)
1139 		ibvcq_ex->read_imm_data = irdma_wc_read_imm_data;
1140 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_QP_NUM)
1141 		ibvcq_ex->read_qp_num = irdma_wc_read_qp_num;
1142 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_SRC_QP)
1143 		ibvcq_ex->read_src_qp = irdma_wc_read_src_qp;
1144 	if (attr_ex->wc_flags & IBV_WC_EX_WITH_SL)
1145 		ibvcq_ex->read_sl = irdma_wc_read_sl;
1146 }
1147 
1148 /**
1149  * irdma_arm_cq - arm of cq
1150  * @iwucq: cq to which arm
1151  * @cq_notify: notification params
1152  */
1153 static void
1154 irdma_arm_cq(struct irdma_ucq *iwucq,
1155 	     enum irdma_cmpl_notify cq_notify)
1156 {
1157 	iwucq->is_armed = true;
1158 	iwucq->arm_sol = true;
1159 	iwucq->skip_arm = false;
1160 	iwucq->skip_sol = true;
1161 	irdma_uk_cq_request_notification(&iwucq->cq, cq_notify);
1162 }
1163 
1164 /**
1165  * irdma_uarm_cq - callback for arm of cq
1166  * @cq: cq to arm
1167  * @solicited: to get notify params
1168  */
1169 int
1170 irdma_uarm_cq(struct ibv_cq *cq, int solicited)
1171 {
1172 	struct irdma_ucq *iwucq;
1173 	enum irdma_cmpl_notify cq_notify = IRDMA_CQ_COMPL_EVENT;
1174 	int ret;
1175 
1176 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1177 	if (solicited)
1178 		cq_notify = IRDMA_CQ_COMPL_SOLICITED;
1179 
1180 	ret = pthread_spin_lock(&iwucq->lock);
1181 	if (ret)
1182 		return ret;
1183 
1184 	if (iwucq->is_armed) {
1185 		if (iwucq->arm_sol && !solicited) {
1186 			irdma_arm_cq(iwucq, cq_notify);
1187 		} else {
1188 			iwucq->skip_arm = true;
1189 			iwucq->skip_sol = solicited ? true : false;
1190 		}
1191 	} else {
1192 		irdma_arm_cq(iwucq, cq_notify);
1193 	}
1194 
1195 	pthread_spin_unlock(&iwucq->lock);
1196 
1197 	return 0;
1198 }
1199 
1200 /**
1201  * irdma_cq_event - cq to do completion event
1202  * @cq: cq to arm
1203  */
1204 void
1205 irdma_cq_event(struct ibv_cq *cq)
1206 {
1207 	struct irdma_ucq *iwucq;
1208 
1209 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1210 	if (pthread_spin_lock(&iwucq->lock))
1211 		return;
1212 
1213 	if (iwucq->skip_arm)
1214 		irdma_arm_cq(iwucq, IRDMA_CQ_COMPL_EVENT);
1215 	else
1216 		iwucq->is_armed = false;
1217 
1218 	pthread_spin_unlock(&iwucq->lock);
1219 }
1220 
1221 void *
1222 irdma_mmap(int fd, off_t offset)
1223 {
1224 	void *map;
1225 
1226 	map = mmap(NULL, IRDMA_HW_PAGE_SIZE, PROT_WRITE | PROT_READ, MAP_SHARED,
1227 		   fd, offset);
1228 	if (map == MAP_FAILED)
1229 		return map;
1230 
1231 	if (ibv_dontfork_range(map, IRDMA_HW_PAGE_SIZE)) {
1232 		munmap(map, IRDMA_HW_PAGE_SIZE);
1233 		return MAP_FAILED;
1234 	}
1235 
1236 	return map;
1237 }
1238 
1239 void
1240 irdma_munmap(void *map)
1241 {
1242 	ibv_dofork_range(map, IRDMA_HW_PAGE_SIZE);
1243 	munmap(map, IRDMA_HW_PAGE_SIZE);
1244 }
1245 
1246 /**
1247  * irdma_destroy_vmapped_qp - destroy resources for qp
1248  * @iwuqp: qp struct for resources
1249  */
1250 static int
1251 irdma_destroy_vmapped_qp(struct irdma_uqp *iwuqp)
1252 {
1253 	int ret;
1254 
1255 	ret = ibv_cmd_destroy_qp(&iwuqp->ibv_qp);
1256 	if (ret)
1257 		return ret;
1258 
1259 	if (iwuqp->qp.push_db)
1260 		irdma_munmap(iwuqp->qp.push_db);
1261 	if (iwuqp->qp.push_wqe)
1262 		irdma_munmap(iwuqp->qp.push_wqe);
1263 
1264 	ibv_cmd_dereg_mr(&iwuqp->vmr.ibv_mr);
1265 
1266 	return 0;
1267 }
1268 
1269 /**
1270  * irdma_vmapped_qp - create resources for qp
1271  * @iwuqp: qp struct for resources
1272  * @pd: pd for the qp
1273  * @attr: attributes of qp passed
1274  * @resp: response back from create qp
1275  * @info: uk info for initializing user level qp
1276  * @abi_ver: abi version of the create qp command
1277  */
1278 static int
1279 irdma_vmapped_qp(struct irdma_uqp *iwuqp, struct ibv_pd *pd,
1280 		 struct ibv_qp_init_attr *attr,
1281 		 struct irdma_qp_uk_init_info *info,
1282 		 bool legacy_mode)
1283 {
1284 	struct irdma_ucreate_qp cmd = {};
1285 	size_t sqsize, rqsize, totalqpsize;
1286 	struct irdma_ucreate_qp_resp resp = {};
1287 	struct irdma_ureg_mr reg_mr_cmd = {};
1288 	struct ibv_reg_mr_resp reg_mr_resp = {};
1289 	int ret;
1290 
1291 	sqsize = roundup(info->sq_depth * IRDMA_QP_WQE_MIN_SIZE, IRDMA_HW_PAGE_SIZE);
1292 	rqsize = roundup(info->rq_depth * IRDMA_QP_WQE_MIN_SIZE, IRDMA_HW_PAGE_SIZE);
1293 	totalqpsize = rqsize + sqsize + IRDMA_DB_SHADOW_AREA_SIZE;
1294 	info->sq = irdma_alloc_hw_buf(totalqpsize);
1295 	iwuqp->buf_size = totalqpsize;
1296 
1297 	if (!info->sq)
1298 		return ENOMEM;
1299 
1300 	memset(info->sq, 0, totalqpsize);
1301 	info->rq = &info->sq[sqsize / IRDMA_QP_WQE_MIN_SIZE];
1302 	info->shadow_area = info->rq[rqsize / IRDMA_QP_WQE_MIN_SIZE].elem;
1303 
1304 	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_QP;
1305 	reg_mr_cmd.sq_pages = sqsize >> IRDMA_HW_PAGE_SHIFT;
1306 	reg_mr_cmd.rq_pages = rqsize >> IRDMA_HW_PAGE_SHIFT;
1307 
1308 	ret = ibv_cmd_reg_mr(pd, info->sq, totalqpsize,
1309 			     (uintptr_t)info->sq, IBV_ACCESS_LOCAL_WRITE,
1310 			     &iwuqp->vmr.ibv_mr, &reg_mr_cmd.ibv_cmd,
1311 			     sizeof(reg_mr_cmd), &reg_mr_resp,
1312 			     sizeof(reg_mr_resp));
1313 	if (ret)
1314 		goto err_dereg_mr;
1315 
1316 	cmd.user_wqe_bufs = (__u64) ((uintptr_t)info->sq);
1317 	cmd.user_compl_ctx = (__u64) (uintptr_t)&iwuqp->qp;
1318 	cmd.comp_mask |= IRDMA_CREATE_QP_USE_START_WQE_IDX;
1319 
1320 	ret = ibv_cmd_create_qp(pd, &iwuqp->ibv_qp, attr, &cmd.ibv_cmd,
1321 				sizeof(cmd), &resp.ibv_resp,
1322 				sizeof(struct irdma_ucreate_qp_resp));
1323 	if (ret)
1324 		goto err_qp;
1325 
1326 	info->sq_size = resp.actual_sq_size;
1327 	info->rq_size = resp.actual_rq_size;
1328 	info->first_sq_wq = legacy_mode ? 1 : resp.lsmm;
1329 	if (resp.comp_mask & IRDMA_CREATE_QP_USE_START_WQE_IDX)
1330 		info->start_wqe_idx = resp.start_wqe_idx;
1331 	info->qp_caps = resp.qp_caps;
1332 	info->qp_id = resp.qp_id;
1333 	iwuqp->irdma_drv_opt = resp.irdma_drv_opt;
1334 	iwuqp->ibv_qp.qp_num = resp.qp_id;
1335 
1336 	iwuqp->send_cq = container_of(attr->send_cq, struct irdma_ucq,
1337 				      verbs_cq.cq);
1338 	iwuqp->recv_cq = container_of(attr->recv_cq, struct irdma_ucq,
1339 				      verbs_cq.cq);
1340 	iwuqp->send_cq->uqp = iwuqp;
1341 	iwuqp->recv_cq->uqp = iwuqp;
1342 
1343 	return 0;
1344 err_qp:
1345 	ibv_cmd_dereg_mr(&iwuqp->vmr.ibv_mr);
1346 err_dereg_mr:
1347 	printf("%s: failed to create QP, status %d\n", __func__, ret);
1348 	irdma_free_hw_buf(info->sq, iwuqp->buf_size);
1349 	return ret;
1350 }
1351 
1352 /**
1353  * irdma_ucreate_qp - create qp on user app
1354  * @pd: pd for the qp
1355  * @attr: attributes of the qp to be created (sizes, sge, cq)
1356  */
1357 struct ibv_qp *
1358 irdma_ucreate_qp(struct ibv_pd *pd,
1359 		 struct ibv_qp_init_attr *attr)
1360 {
1361 	struct irdma_qp_uk_init_info info = {};
1362 	struct irdma_uk_attrs *uk_attrs;
1363 	struct irdma_uvcontext *iwvctx;
1364 	struct irdma_uqp *iwuqp;
1365 	int status;
1366 
1367 	if (attr->qp_type != IBV_QPT_RC && attr->qp_type != IBV_QPT_UD) {
1368 		printf("%s: failed to create QP, unsupported QP type: 0x%x\n",
1369 		       __func__, attr->qp_type);
1370 		errno = EOPNOTSUPP;
1371 		return NULL;
1372 	}
1373 
1374 	iwvctx = container_of(pd->context, struct irdma_uvcontext, ibv_ctx);
1375 	uk_attrs = &iwvctx->uk_attrs;
1376 
1377 	if (attr->cap.max_send_sge > uk_attrs->max_hw_wq_frags ||
1378 	    attr->cap.max_recv_sge > uk_attrs->max_hw_wq_frags ||
1379 	    attr->cap.max_send_wr > uk_attrs->max_hw_wq_quanta ||
1380 	    attr->cap.max_recv_wr > uk_attrs->max_hw_rq_quanta ||
1381 	    attr->cap.max_inline_data > uk_attrs->max_hw_inline) {
1382 		errno = EINVAL;
1383 		return NULL;
1384 	}
1385 
1386 	info.uk_attrs = uk_attrs;
1387 	info.sq_size = attr->cap.max_send_wr;
1388 	info.rq_size = attr->cap.max_recv_wr;
1389 	info.max_sq_frag_cnt = attr->cap.max_send_sge;
1390 	info.max_rq_frag_cnt = attr->cap.max_recv_sge;
1391 	info.max_inline_data = attr->cap.max_inline_data;
1392 	info.abi_ver = iwvctx->abi_ver;
1393 
1394 	status = irdma_uk_calc_depth_shift_sq(&info, &info.sq_depth, &info.sq_shift);
1395 	if (status) {
1396 		printf("%s: invalid SQ attributes, max_send_wr=%d max_send_sge=%d max_inline=%d\n",
1397 		       __func__, attr->cap.max_send_wr, attr->cap.max_send_sge,
1398 		       attr->cap.max_inline_data);
1399 		errno = status;
1400 		return NULL;
1401 	}
1402 
1403 	status = irdma_uk_calc_depth_shift_rq(&info, &info.rq_depth, &info.rq_shift);
1404 	if (status) {
1405 		printf("%s: invalid RQ attributes, recv_wr=%d recv_sge=%d\n",
1406 		       __func__, attr->cap.max_recv_wr, attr->cap.max_recv_sge);
1407 		errno = status;
1408 		return NULL;
1409 	}
1410 
1411 	iwuqp = memalign(1024, sizeof(*iwuqp));
1412 	if (!iwuqp)
1413 		return NULL;
1414 
1415 	memset(iwuqp, 0, sizeof(*iwuqp));
1416 
1417 	status = pthread_spin_init(&iwuqp->lock, PTHREAD_PROCESS_PRIVATE);
1418 	if (status)
1419 		goto err_free_qp;
1420 
1421 	info.sq_size = info.sq_depth >> info.sq_shift;
1422 	info.rq_size = info.rq_depth >> info.rq_shift;
1423 	/**
1424 	 * Maintain backward compatibility with older ABI which pass sq
1425 	 * and rq depth (in quanta) in cap.max_send_wr a cap.max_recv_wr
1426 	 */
1427 	if (!iwvctx->use_raw_attrs) {
1428 		attr->cap.max_send_wr = info.sq_size;
1429 		attr->cap.max_recv_wr = info.rq_size;
1430 	}
1431 
1432 	info.wqe_alloc_db = (u32 *)iwvctx->db;
1433 	info.legacy_mode = iwvctx->legacy_mode;
1434 	info.sq_wrtrk_array = calloc(info.sq_depth, sizeof(*info.sq_wrtrk_array));
1435 	if (!info.sq_wrtrk_array) {
1436 		status = errno;	/* preserve errno */
1437 		goto err_destroy_lock;
1438 	}
1439 
1440 	info.rq_wrid_array = calloc(info.rq_depth, sizeof(*info.rq_wrid_array));
1441 	if (!info.rq_wrid_array) {
1442 		status = errno;	/* preserve errno */
1443 		goto err_free_sq_wrtrk;
1444 	}
1445 
1446 	iwuqp->sq_sig_all = attr->sq_sig_all;
1447 	iwuqp->qp_type = attr->qp_type;
1448 	status = irdma_vmapped_qp(iwuqp, pd, attr, &info, iwvctx->legacy_mode);
1449 	if (status)
1450 		goto err_free_rq_wrid;
1451 
1452 	iwuqp->qp.back_qp = iwuqp;
1453 	iwuqp->qp.lock = &iwuqp->lock;
1454 
1455 	status = irdma_uk_qp_init(&iwuqp->qp, &info);
1456 	if (status)
1457 		goto err_free_vmap_qp;
1458 
1459 	attr->cap.max_send_wr = (info.sq_depth - IRDMA_SQ_RSVD) >> info.sq_shift;
1460 	attr->cap.max_recv_wr = (info.rq_depth - IRDMA_RQ_RSVD) >> info.rq_shift;
1461 
1462 	return &iwuqp->ibv_qp;
1463 
1464 err_free_vmap_qp:
1465 	irdma_destroy_vmapped_qp(iwuqp);
1466 	irdma_free_hw_buf(info.sq, iwuqp->buf_size);
1467 err_free_rq_wrid:
1468 	free(info.rq_wrid_array);
1469 err_free_sq_wrtrk:
1470 	free(info.sq_wrtrk_array);
1471 err_destroy_lock:
1472 	pthread_spin_destroy(&iwuqp->lock);
1473 err_free_qp:
1474 	printf("%s: failed to create QP\n", __func__);
1475 	free(iwuqp);
1476 
1477 	errno = status;
1478 	return NULL;
1479 }
1480 
1481 /**
1482  * irdma_uquery_qp - query qp for some attribute
1483  * @qp: qp for the attributes query
1484  * @attr: to return the attributes
1485  * @attr_mask: mask of what is query for
1486  * @init_attr: initial attributes during create_qp
1487  */
1488 int
1489 irdma_uquery_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask,
1490 		struct ibv_qp_init_attr *init_attr)
1491 {
1492 	struct ibv_query_qp cmd;
1493 
1494 	return ibv_cmd_query_qp(qp, attr, attr_mask, init_attr, &cmd,
1495 				sizeof(cmd));
1496 }
1497 
1498 /**
1499  * irdma_umodify_qp - send qp modify to driver
1500  * @qp: qp to modify
1501  * @attr: attribute to modify
1502  * @attr_mask: mask of the attribute
1503  */
1504 int
1505 irdma_umodify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask)
1506 {
1507 	struct irdma_umodify_qp_resp resp = {};
1508 	struct ibv_modify_qp cmd = {};
1509 	struct irdma_modify_qp_cmd cmd_ex = {};
1510 	struct irdma_uvcontext *iwvctx;
1511 	struct irdma_uqp *iwuqp;
1512 
1513 	iwuqp = container_of(qp, struct irdma_uqp, ibv_qp);
1514 	iwvctx = container_of(qp->context, struct irdma_uvcontext, ibv_ctx);
1515 
1516 	if (iwuqp->qp.qp_caps & IRDMA_PUSH_MODE && attr_mask & IBV_QP_STATE &&
1517 	    iwvctx->uk_attrs.hw_rev > IRDMA_GEN_1) {
1518 		u64 offset;
1519 		void *map;
1520 		int ret;
1521 
1522 		ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex.ibv_cmd,
1523 					   sizeof(cmd_ex.ibv_cmd),
1524 					   sizeof(cmd_ex), &resp.ibv_resp,
1525 					   sizeof(resp.ibv_resp),
1526 					   sizeof(resp));
1527 		if (!ret)
1528 			iwuqp->qp.rd_fence_rate = resp.rd_fence_rate;
1529 		if (ret || !resp.push_valid)
1530 			return ret;
1531 
1532 		if (iwuqp->qp.push_wqe)
1533 			return ret;
1534 
1535 		offset = resp.push_wqe_mmap_key;
1536 		map = irdma_mmap(qp->context->cmd_fd, offset);
1537 		if (map == MAP_FAILED)
1538 			return ret;
1539 
1540 		iwuqp->qp.push_wqe = map;
1541 
1542 		offset = resp.push_db_mmap_key;
1543 		map = irdma_mmap(qp->context->cmd_fd, offset);
1544 		if (map == MAP_FAILED) {
1545 			irdma_munmap(iwuqp->qp.push_wqe);
1546 			iwuqp->qp.push_wqe = NULL;
1547 			printf("failed to map push page, errno %d\n", errno);
1548 			return ret;
1549 		}
1550 		iwuqp->qp.push_wqe += resp.push_offset;
1551 		iwuqp->qp.push_db = map + resp.push_offset;
1552 
1553 		return ret;
1554 	} else {
1555 		return ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd));
1556 	}
1557 }
1558 
1559 static void
1560 irdma_issue_flush(struct ibv_qp *qp, bool sq_flush, bool rq_flush)
1561 {
1562 	struct irdma_umodify_qp_resp resp = {};
1563 	struct irdma_modify_qp_cmd cmd_ex = {};
1564 	struct ibv_qp_attr attr = {};
1565 
1566 	attr.qp_state = IBV_QPS_ERR;
1567 	cmd_ex.sq_flush = sq_flush;
1568 	cmd_ex.rq_flush = rq_flush;
1569 
1570 	ibv_cmd_modify_qp_ex(qp, &attr, IBV_QP_STATE,
1571 			     &cmd_ex.ibv_cmd,
1572 			     sizeof(cmd_ex.ibv_cmd),
1573 			     sizeof(cmd_ex), &resp.ibv_resp,
1574 			     sizeof(resp.ibv_resp),
1575 			     sizeof(resp));
1576 }
1577 
1578 /**
1579  * irdma_clean_cqes - clean cq entries for qp
1580  * @qp: qp for which completions are cleaned
1581  * @iwcq: cq to be cleaned
1582  */
1583 static void
1584 irdma_clean_cqes(struct irdma_qp_uk *qp, struct irdma_ucq *iwucq)
1585 {
1586 	struct irdma_cq_uk *ukcq = &iwucq->cq;
1587 	int ret;
1588 
1589 	ret = pthread_spin_lock(&iwucq->lock);
1590 	if (ret)
1591 		return;
1592 
1593 	irdma_uk_clean_cq(qp, ukcq);
1594 	pthread_spin_unlock(&iwucq->lock);
1595 }
1596 
1597 /**
1598  * irdma_udestroy_qp - destroy qp
1599  * @qp: qp to destroy
1600  */
1601 int
1602 irdma_udestroy_qp(struct ibv_qp *qp)
1603 {
1604 	struct irdma_uqp *iwuqp;
1605 	int ret;
1606 
1607 	iwuqp = container_of(qp, struct irdma_uqp, ibv_qp);
1608 	ret = pthread_spin_destroy(&iwuqp->lock);
1609 	if (ret)
1610 		goto err;
1611 
1612 	ret = irdma_destroy_vmapped_qp(iwuqp);
1613 	if (ret)
1614 		goto err;
1615 
1616 	/* Clean any pending completions from the cq(s) */
1617 	if (iwuqp->send_cq)
1618 		irdma_clean_cqes(&iwuqp->qp, iwuqp->send_cq);
1619 
1620 	if (iwuqp->recv_cq && iwuqp->recv_cq != iwuqp->send_cq)
1621 		irdma_clean_cqes(&iwuqp->qp, iwuqp->recv_cq);
1622 
1623 	if (iwuqp->qp.sq_wrtrk_array)
1624 		free(iwuqp->qp.sq_wrtrk_array);
1625 	if (iwuqp->qp.rq_wrid_array)
1626 		free(iwuqp->qp.rq_wrid_array);
1627 
1628 	irdma_free_hw_buf(iwuqp->qp.sq_base, iwuqp->buf_size);
1629 	free(iwuqp);
1630 	return 0;
1631 
1632 err:
1633 	printf("%s: failed to destroy QP, status %d\n",
1634 	       __func__, ret);
1635 	return ret;
1636 }
1637 
/**
 * calc_type2_mw_stag - calculate type 2 MW stag
 * @rkey: desired rkey of the MW
 * @mw_rkey: type2 memory window rkey
 *
 * compute type2 memory window stag by taking the lower 8 bits
 * of the desired rkey and leaving the upper 24 bits of mw->rkey
 * unchanged
 */
static inline u32 calc_type2_mw_stag(u32 rkey, u32 mw_rkey)
{
	const u32 low8 = 0xff;
	u32 stag = mw_rkey & ~low8;

	stag |= rkey & low8;

	return stag;
}
1651 
1652 /**
1653  * irdma_post_send -  post send wr for user application
1654  * @ib_qp: qp to post wr
1655  * @ib_wr: work request ptr
1656  * @bad_wr: return of bad wr if err
1657  */
1658 int
1659 irdma_upost_send(struct ibv_qp *ib_qp, struct ibv_send_wr *ib_wr,
1660 		 struct ibv_send_wr **bad_wr)
1661 {
1662 	struct irdma_post_sq_info info;
1663 	struct irdma_uvcontext *iwvctx;
1664 	struct irdma_uk_attrs *uk_attrs;
1665 	struct irdma_uqp *iwuqp;
1666 	bool reflush = false;
1667 	int err = 0;
1668 
1669 	iwuqp = container_of(ib_qp, struct irdma_uqp, ibv_qp);
1670 	iwvctx = container_of(ib_qp->context, struct irdma_uvcontext, ibv_ctx);
1671 	uk_attrs = &iwvctx->uk_attrs;
1672 
1673 	err = pthread_spin_lock(&iwuqp->lock);
1674 	if (err)
1675 		return err;
1676 
1677 	if (!IRDMA_RING_MORE_WORK(iwuqp->qp.sq_ring) &&
1678 	    ib_qp->state == IBV_QPS_ERR)
1679 		reflush = true;
1680 
1681 	while (ib_wr) {
1682 		memset(&info, 0, sizeof(info));
1683 		info.wr_id = (u64)(ib_wr->wr_id);
1684 		if ((ib_wr->send_flags & IBV_SEND_SIGNALED) ||
1685 		    iwuqp->sq_sig_all)
1686 			info.signaled = true;
1687 		if (ib_wr->send_flags & IBV_SEND_FENCE)
1688 			info.read_fence = true;
1689 
1690 		switch (ib_wr->opcode) {
1691 		case IBV_WR_SEND_WITH_IMM:
1692 			if (iwuqp->qp.qp_caps & IRDMA_SEND_WITH_IMM) {
1693 				info.imm_data_valid = true;
1694 				info.imm_data = ntohl(ib_wr->imm_data);
1695 			} else {
1696 				err = EINVAL;
1697 				break;
1698 			}
1699 			/* fallthrough */
1700 		case IBV_WR_SEND:
1701 		case IBV_WR_SEND_WITH_INV:
1702 			if (ib_wr->opcode == IBV_WR_SEND ||
1703 			    ib_wr->opcode == IBV_WR_SEND_WITH_IMM) {
1704 				if (ib_wr->send_flags & IBV_SEND_SOLICITED)
1705 					info.op_type = IRDMA_OP_TYPE_SEND_SOL;
1706 				else
1707 					info.op_type = IRDMA_OP_TYPE_SEND;
1708 			} else {
1709 				if (ib_wr->send_flags & IBV_SEND_SOLICITED)
1710 					info.op_type = IRDMA_OP_TYPE_SEND_SOL_INV;
1711 				else
1712 					info.op_type = IRDMA_OP_TYPE_SEND_INV;
1713 				info.stag_to_inv = ib_wr->imm_data;
1714 			}
1715 			info.op.send.num_sges = ib_wr->num_sge;
1716 			info.op.send.sg_list = (struct ibv_sge *)ib_wr->sg_list;
1717 			if (ib_qp->qp_type == IBV_QPT_UD) {
1718 				struct irdma_uah *ah = container_of(ib_wr->wr.ud.ah,
1719 								    struct irdma_uah, ibv_ah);
1720 
1721 				info.op.send.ah_id = ah->ah_id;
1722 				info.op.send.qkey = ib_wr->wr.ud.remote_qkey;
1723 				info.op.send.dest_qp = ib_wr->wr.ud.remote_qpn;
1724 			}
1725 
1726 			if (ib_wr->send_flags & IBV_SEND_INLINE)
1727 				err = irdma_uk_inline_send(&iwuqp->qp, &info, false);
1728 			else
1729 				err = irdma_uk_send(&iwuqp->qp, &info, false);
1730 			break;
1731 		case IBV_WR_RDMA_WRITE_WITH_IMM:
1732 			if (iwuqp->qp.qp_caps & IRDMA_WRITE_WITH_IMM) {
1733 				info.imm_data_valid = true;
1734 				info.imm_data = ntohl(ib_wr->imm_data);
1735 			} else {
1736 				err = EINVAL;
1737 				break;
1738 			}
1739 			/* fallthrough */
1740 		case IBV_WR_RDMA_WRITE:
1741 			if (ib_wr->send_flags & IBV_SEND_SOLICITED)
1742 				info.op_type = IRDMA_OP_TYPE_RDMA_WRITE_SOL;
1743 			else
1744 				info.op_type = IRDMA_OP_TYPE_RDMA_WRITE;
1745 
1746 			info.op.rdma_write.num_lo_sges = ib_wr->num_sge;
1747 			info.op.rdma_write.lo_sg_list = ib_wr->sg_list;
1748 			info.op.rdma_write.rem_addr.addr = ib_wr->wr.rdma.remote_addr;
1749 			info.op.rdma_write.rem_addr.lkey = ib_wr->wr.rdma.rkey;
1750 			if (ib_wr->send_flags & IBV_SEND_INLINE)
1751 				err = irdma_uk_inline_rdma_write(&iwuqp->qp, &info, false);
1752 			else
1753 				err = irdma_uk_rdma_write(&iwuqp->qp, &info, false);
1754 			break;
1755 		case IBV_WR_RDMA_READ:
1756 			if (ib_wr->num_sge > uk_attrs->max_hw_read_sges) {
1757 				err = EINVAL;
1758 				break;
1759 			}
1760 			info.op_type = IRDMA_OP_TYPE_RDMA_READ;
1761 			info.op.rdma_read.rem_addr.addr = ib_wr->wr.rdma.remote_addr;
1762 			info.op.rdma_read.rem_addr.lkey = ib_wr->wr.rdma.rkey;
1763 
1764 			info.op.rdma_read.lo_sg_list = ib_wr->sg_list;
1765 			info.op.rdma_read.num_lo_sges = ib_wr->num_sge;
1766 			err = irdma_uk_rdma_read(&iwuqp->qp, &info, false, false);
1767 			break;
1768 		case IBV_WR_BIND_MW:
1769 			if (ib_qp->qp_type != IBV_QPT_RC) {
1770 				err = EINVAL;
1771 				break;
1772 			}
1773 			info.op_type = IRDMA_OP_TYPE_BIND_MW;
1774 			info.op.bind_window.mr_stag = ib_wr->bind_mw.bind_info.mr->rkey;
1775 			if (ib_wr->bind_mw.mw->type == IBV_MW_TYPE_1) {
1776 				info.op.bind_window.mem_window_type_1 = true;
1777 				info.op.bind_window.mw_stag = ib_wr->bind_mw.rkey;
1778 			} else {
1779 				struct verbs_mr *vmr = verbs_get_mr(ib_wr->bind_mw.bind_info.mr);
1780 
1781 				if (vmr->access & IBV_ACCESS_ZERO_BASED) {
1782 					err = EINVAL;
1783 					break;
1784 				}
1785 				info.op.bind_window.mw_stag =
1786 				    calc_type2_mw_stag(ib_wr->bind_mw.rkey, ib_wr->bind_mw.mw->rkey);
1787 				ib_wr->bind_mw.mw->rkey = info.op.bind_window.mw_stag;
1788 
1789 			}
1790 
1791 			if (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_ZERO_BASED) {
1792 				info.op.bind_window.addressing_type = IRDMA_ADDR_TYPE_ZERO_BASED;
1793 				info.op.bind_window.va = NULL;
1794 			} else {
1795 				info.op.bind_window.addressing_type = IRDMA_ADDR_TYPE_VA_BASED;
1796 				info.op.bind_window.va = (void *)(uintptr_t)ib_wr->bind_mw.bind_info.addr;
1797 			}
1798 			info.op.bind_window.bind_len = ib_wr->bind_mw.bind_info.length;
1799 			info.op.bind_window.ena_reads =
1800 			    (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_REMOTE_READ) ? 1 : 0;
1801 			info.op.bind_window.ena_writes =
1802 			    (ib_wr->bind_mw.bind_info.mw_access_flags & IBV_ACCESS_REMOTE_WRITE) ? 1 : 0;
1803 
1804 			err = irdma_uk_mw_bind(&iwuqp->qp, &info, false);
1805 			break;
1806 		case IBV_WR_LOCAL_INV:
1807 			info.op_type = IRDMA_OP_TYPE_INV_STAG;
1808 			info.op.inv_local_stag.target_stag = ib_wr->imm_data;
1809 			err = irdma_uk_stag_local_invalidate(&iwuqp->qp, &info, true);
1810 			break;
1811 		default:
1812 			/* error */
1813 			err = EINVAL;
1814 			printf("%s: post work request failed, invalid opcode: 0x%x\n",
1815 			       __func__, ib_wr->opcode);
1816 			break;
1817 		}
1818 		if (err)
1819 			break;
1820 
1821 		ib_wr = ib_wr->next;
1822 	}
1823 
1824 	if (err)
1825 		*bad_wr = ib_wr;
1826 
1827 	irdma_uk_qp_post_wr(&iwuqp->qp);
1828 	if (reflush)
1829 		irdma_issue_flush(ib_qp, 1, 0);
1830 
1831 	pthread_spin_unlock(&iwuqp->lock);
1832 
1833 	return err;
1834 }
1835 
1836 /**
1837  * irdma_post_recv - post receive wr for user application
1838  * @ib_wr: work request for receive
1839  * @bad_wr: bad wr caused an error
1840  */
1841 int
1842 irdma_upost_recv(struct ibv_qp *ib_qp, struct ibv_recv_wr *ib_wr,
1843 		 struct ibv_recv_wr **bad_wr)
1844 {
1845 	struct irdma_post_rq_info post_recv = {};
1846 	struct irdma_uqp *iwuqp;
1847 	bool reflush = false;
1848 	int err = 0;
1849 
1850 	iwuqp = container_of(ib_qp, struct irdma_uqp, ibv_qp);
1851 	err = pthread_spin_lock(&iwuqp->lock);
1852 	if (err)
1853 		return err;
1854 
1855 	if (!IRDMA_RING_MORE_WORK(iwuqp->qp.rq_ring) &&
1856 	    ib_qp->state == IBV_QPS_ERR)
1857 		reflush = true;
1858 
1859 	while (ib_wr) {
1860 		if (ib_wr->num_sge > iwuqp->qp.max_rq_frag_cnt) {
1861 			*bad_wr = ib_wr;
1862 			err = EINVAL;
1863 			goto error;
1864 		}
1865 		post_recv.num_sges = ib_wr->num_sge;
1866 		post_recv.wr_id = ib_wr->wr_id;
1867 		post_recv.sg_list = ib_wr->sg_list;
1868 		err = irdma_uk_post_receive(&iwuqp->qp, &post_recv);
1869 		if (err) {
1870 			*bad_wr = ib_wr;
1871 			goto error;
1872 		}
1873 
1874 		if (reflush)
1875 			irdma_issue_flush(ib_qp, 0, 1);
1876 
1877 		ib_wr = ib_wr->next;
1878 	}
1879 error:
1880 	pthread_spin_unlock(&iwuqp->lock);
1881 
1882 	return err;
1883 }
1884 
1885 /**
1886  * irdma_ucreate_ah - create address handle associated with a pd
1887  * @ibpd: pd for the address handle
1888  * @attr: attributes of address handle
1889  */
1890 struct ibv_ah *
1891 irdma_ucreate_ah(struct ibv_pd *ibpd, struct ibv_ah_attr *attr)
1892 {
1893 	struct irdma_uah *ah;
1894 	union ibv_gid sgid;
1895 	struct irdma_ucreate_ah_resp resp = {};
1896 	int err;
1897 
1898 	if (ibv_query_gid(ibpd->context, attr->port_num, attr->grh.sgid_index,
1899 			  &sgid)) {
1900 		fprintf(stderr, "irdma: Error from ibv_query_gid.\n");
1901 		errno = ENOENT;
1902 		return NULL;
1903 	}
1904 
1905 	ah = calloc(1, sizeof(*ah));
1906 	if (!ah)
1907 		return NULL;
1908 
1909 	err = ibv_cmd_create_ah(ibpd, &ah->ibv_ah, attr, &resp.ibv_resp,
1910 				sizeof(resp));
1911 	if (err) {
1912 		free(ah);
1913 		errno = err;
1914 		return NULL;
1915 	}
1916 
1917 	ah->ah_id = resp.ah_id;
1918 
1919 	return &ah->ibv_ah;
1920 }
1921 
1922 /**
1923  * irdma_udestroy_ah - destroy the address handle
1924  * @ibah: address handle
1925  */
1926 int
1927 irdma_udestroy_ah(struct ibv_ah *ibah)
1928 {
1929 	struct irdma_uah *ah;
1930 	int ret;
1931 
1932 	ah = container_of(ibah, struct irdma_uah, ibv_ah);
1933 
1934 	ret = ibv_cmd_destroy_ah(ibah);
1935 	if (ret)
1936 		return ret;
1937 
1938 	free(ah);
1939 
1940 	return 0;
1941 }
1942 
/**
 * irdma_uattach_mcast - Attach qp to multicast group
 * @qp: The queue pair
 * @gid: The Global ID for multicast group
 * @lid: The Local ID
 *
 * Forwards to ibv_cmd_attach_mcast() and returns its status.
 */
int
irdma_uattach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
		    uint16_t lid)
{
	int status;

	status = ibv_cmd_attach_mcast(qp, gid, lid);

	return status;
}
1955 
/**
 * irdma_udetach_mcast - Detach qp from multicast group
 * @qp: The queue pair
 * @gid: The Global ID for multicast group
 * @lid: The Local ID
 *
 * Forwards to ibv_cmd_detach_mcast() and returns its status.
 */
int
irdma_udetach_mcast(struct ibv_qp *qp, const union ibv_gid *gid,
		    uint16_t lid)
{
	int status;

	status = ibv_cmd_detach_mcast(qp, gid, lid);

	return status;
}
1968 
1969 /**
1970  * irdma_uresize_cq - resizes a cq
1971  * @cq: cq to resize
1972  * @cqe: the number of cqes of the new cq
1973  */
1974 int
1975 irdma_uresize_cq(struct ibv_cq *cq, int cqe)
1976 {
1977 	struct irdma_uvcontext *iwvctx;
1978 	struct irdma_uk_attrs *uk_attrs;
1979 	struct irdma_uresize_cq cmd = {};
1980 	struct ibv_resize_cq_resp resp = {};
1981 	struct irdma_ureg_mr reg_mr_cmd = {};
1982 	struct ibv_reg_mr_resp reg_mr_resp = {};
1983 	struct irdma_cq_buf *cq_buf = NULL;
1984 	struct irdma_cqe *cq_base = NULL;
1985 	struct verbs_mr new_mr = {};
1986 	struct irdma_ucq *iwucq;
1987 	size_t cq_size;
1988 	u32 cq_pages;
1989 	int cqe_needed;
1990 	int ret = 0;
1991 
1992 	iwucq = container_of(cq, struct irdma_ucq, verbs_cq.cq);
1993 	iwvctx = container_of(cq->context, struct irdma_uvcontext, ibv_ctx);
1994 	uk_attrs = &iwvctx->uk_attrs;
1995 
1996 	if (!(uk_attrs->feature_flags & IRDMA_FEATURE_CQ_RESIZE))
1997 		return EOPNOTSUPP;
1998 
1999 	if (cqe < uk_attrs->min_hw_cq_size || cqe > uk_attrs->max_hw_cq_size - 1)
2000 		return EINVAL;
2001 
2002 	cqe_needed = get_cq_size(cqe, uk_attrs->hw_rev);
2003 	if (cqe_needed == iwucq->cq.cq_size)
2004 		return 0;
2005 
2006 	cq_size = get_cq_total_bytes(cqe_needed);
2007 	cq_pages = cq_size >> IRDMA_HW_PAGE_SHIFT;
2008 	cq_base = irdma_alloc_hw_buf(cq_size);
2009 	if (!cq_base)
2010 		return ENOMEM;
2011 
2012 	memset(cq_base, 0, cq_size);
2013 
2014 	cq_buf = malloc(sizeof(*cq_buf));
2015 	if (!cq_buf) {
2016 		ret = ENOMEM;
2017 		goto err_buf;
2018 	}
2019 
2020 	new_mr.ibv_mr.pd = iwucq->vmr.ibv_mr.pd;
2021 	reg_mr_cmd.reg_type = IRDMA_MEMREG_TYPE_CQ;
2022 	reg_mr_cmd.cq_pages = cq_pages;
2023 
2024 	ret = ibv_cmd_reg_mr(new_mr.ibv_mr.pd, cq_base, cq_size,
2025 			     (uintptr_t)cq_base, IBV_ACCESS_LOCAL_WRITE,
2026 			     &new_mr.ibv_mr, &reg_mr_cmd.ibv_cmd, sizeof(reg_mr_cmd),
2027 			     &reg_mr_resp, sizeof(reg_mr_resp));
2028 	if (ret)
2029 		goto err_dereg_mr;
2030 
2031 	ret = pthread_spin_lock(&iwucq->lock);
2032 	if (ret)
2033 		goto err_lock;
2034 
2035 	cmd.user_cq_buffer = (__u64) ((uintptr_t)cq_base);
2036 	ret = ibv_cmd_resize_cq(&iwucq->verbs_cq.cq, cqe_needed, &cmd.ibv_cmd,
2037 				sizeof(cmd), &resp, sizeof(resp));
2038 	if (ret)
2039 		goto err_resize;
2040 
2041 	memcpy(&cq_buf->cq, &iwucq->cq, sizeof(cq_buf->cq));
2042 	cq_buf->vmr = iwucq->vmr;
2043 	iwucq->vmr = new_mr;
2044 	irdma_uk_cq_resize(&iwucq->cq, cq_base, cqe_needed);
2045 	iwucq->verbs_cq.cq.cqe = cqe;
2046 	LIST_INSERT_HEAD(&iwucq->resize_list, cq_buf, list);
2047 
2048 	pthread_spin_unlock(&iwucq->lock);
2049 
2050 	return ret;
2051 
2052 err_resize:
2053 	pthread_spin_unlock(&iwucq->lock);
2054 err_lock:
2055 	ibv_cmd_dereg_mr(&new_mr.ibv_mr);
2056 err_dereg_mr:
2057 	free(cq_buf);
2058 err_buf:
2059 	fprintf(stderr, "failed to resize CQ cq_id=%d ret=%d\n", iwucq->cq.cq_id, ret);
2060 	irdma_free_hw_buf(cq_base, cq_size);
2061 	return ret;
2062 }
2063