xref: /linux/drivers/infiniband/hw/efa/efa_verbs.c (revision 8a922b7728a93d837954315c98b84f6b78de0c4f)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /*
3  * Copyright 2018-2022 Amazon.com, Inc. or its affiliates. All rights reserved.
4  */
5 
6 #include <linux/dma-buf.h>
7 #include <linux/dma-resv.h>
8 #include <linux/vmalloc.h>
9 #include <linux/log2.h>
10 
11 #include <rdma/ib_addr.h>
12 #include <rdma/ib_umem.h>
13 #include <rdma/ib_user_verbs.h>
14 #include <rdma/ib_verbs.h>
15 #include <rdma/uverbs_ioctl.h>
16 
17 #include "efa.h"
18 #include "efa_io_defs.h"
19 
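/*
 * Values stored in efa_user_mmap_entry->mmap_flag and interpreted by
 * __efa_mmap(): DMA_PAGE entries map driver-allocated host pages with
 * vm_insert_page(), while IO_WC/IO_NC entries map device BAR space with
 * write-combine or non-cached page protection respectively.
 */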
20 enum {
21 	EFA_MMAP_DMA_PAGE = 0,
22 	EFA_MMAP_IO_WC,
23 	EFA_MMAP_IO_NC,
24 };
25 
26 #define EFA_AENQ_ENABLED_GROUPS \
27 	(BIT(EFA_ADMIN_FATAL_ERROR) | BIT(EFA_ADMIN_WARNING) | \
28 	 BIT(EFA_ADMIN_NOTIFICATION) | BIT(EFA_ADMIN_KEEP_ALIVE))
29 
30 struct efa_user_mmap_entry {
31 	struct rdma_user_mmap_entry rdma_entry;
32 	u64 address;
33 	u8 mmap_flag;
34 };
35 
36 #define EFA_DEFINE_DEVICE_STATS(op) \
37 	op(EFA_SUBMITTED_CMDS, "submitted_cmds") \
38 	op(EFA_COMPLETED_CMDS, "completed_cmds") \
39 	op(EFA_CMDS_ERR, "cmds_err") \
40 	op(EFA_NO_COMPLETION_CMDS, "no_completion_cmds") \
41 	op(EFA_KEEP_ALIVE_RCVD, "keep_alive_rcvd") \
42 	op(EFA_ALLOC_PD_ERR, "alloc_pd_err") \
43 	op(EFA_CREATE_QP_ERR, "create_qp_err") \
44 	op(EFA_CREATE_CQ_ERR, "create_cq_err") \
45 	op(EFA_REG_MR_ERR, "reg_mr_err") \
46 	op(EFA_ALLOC_UCONTEXT_ERR, "alloc_ucontext_err") \
47 	op(EFA_CREATE_AH_ERR, "create_ah_err") \
48 	op(EFA_MMAP_ERR, "mmap_err")
49 
50 #define EFA_DEFINE_PORT_STATS(op) \
51 	op(EFA_TX_BYTES, "tx_bytes") \
52 	op(EFA_TX_PKTS, "tx_pkts") \
53 	op(EFA_RX_BYTES, "rx_bytes") \
54 	op(EFA_RX_PKTS, "rx_pkts") \
55 	op(EFA_RX_DROPS, "rx_drops") \
56 	op(EFA_SEND_BYTES, "send_bytes") \
57 	op(EFA_SEND_WRS, "send_wrs") \
58 	op(EFA_RECV_BYTES, "recv_bytes") \
59 	op(EFA_RECV_WRS, "recv_wrs") \
60 	op(EFA_RDMA_READ_WRS, "rdma_read_wrs") \
61 	op(EFA_RDMA_READ_BYTES, "rdma_read_bytes") \
62 	op(EFA_RDMA_READ_WR_ERR, "rdma_read_wr_err") \
63 	op(EFA_RDMA_READ_RESP_BYTES, "rdma_read_resp_bytes") \
64 
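/*
 * X-macro helpers: EFA_STATS_ENUM expands each op() entry above into an enum
 * constant, and EFA_STATS_STR into a designated initializer that fills the
 * matching rdma_stat_desc .name field.
 */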
65 #define EFA_STATS_ENUM(ename, name) ename,
66 #define EFA_STATS_STR(ename, nam) \
67 	[ename].name = nam,
68 
69 enum efa_hw_device_stats {
70 	EFA_DEFINE_DEVICE_STATS(EFA_STATS_ENUM)
71 };
72 
73 static const struct rdma_stat_desc efa_device_stats_descs[] = {
74 	EFA_DEFINE_DEVICE_STATS(EFA_STATS_STR)
75 };
76 
77 enum efa_hw_port_stats {
78 	EFA_DEFINE_PORT_STATS(EFA_STATS_ENUM)
79 };
80 
81 static const struct rdma_stat_desc efa_port_stats_descs[] = {
82 	EFA_DEFINE_PORT_STATS(EFA_STATS_STR)
83 };
84 
85 #define EFA_CHUNK_PAYLOAD_SHIFT       12
86 #define EFA_CHUNK_PAYLOAD_SIZE        BIT(EFA_CHUNK_PAYLOAD_SHIFT)
87 #define EFA_CHUNK_PAYLOAD_PTR_SIZE    8
88 
89 #define EFA_CHUNK_SHIFT               12
90 #define EFA_CHUNK_SIZE                BIT(EFA_CHUNK_SHIFT)
91 #define EFA_CHUNK_PTR_SIZE            sizeof(struct efa_com_ctrl_buff_info)
92 
93 #define EFA_PTRS_PER_CHUNK \
94 	((EFA_CHUNK_SIZE - EFA_CHUNK_PTR_SIZE) / EFA_CHUNK_PAYLOAD_PTR_SIZE)
95 
96 #define EFA_CHUNK_USED_SIZE \
97 	((EFA_PTRS_PER_CHUNK * EFA_CHUNK_PAYLOAD_PTR_SIZE) + EFA_CHUNK_PTR_SIZE)
98 
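/*
 * Layout of an indirect PBL chunk (see pbl_chunk_list_create()): each
 * EFA_CHUNK_SIZE buffer holds EFA_PTRS_PER_CHUNK 8-byte page DMA addresses
 * followed by a struct efa_com_ctrl_buff_info carrying the DMA address and
 * length of the next chunk in the list.
 */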
99 struct pbl_chunk {
100 	dma_addr_t dma_addr;
101 	u64 *buf;
102 	u32 length;
103 };
104 
105 struct pbl_chunk_list {
106 	struct pbl_chunk *chunks;
107 	unsigned int size;
108 };
109 
110 struct pbl_context {
111 	union {
112 		struct {
113 			dma_addr_t dma_addr;
114 		} continuous;
115 		struct {
116 			u32 pbl_buf_size_in_pages;
117 			struct scatterlist *sgl;
118 			int sg_dma_cnt;
119 			struct pbl_chunk_list chunk_list;
120 		} indirect;
121 	} phys;
122 	u64 *pbl_buf;
123 	u32 pbl_buf_size_in_bytes;
124 	u8 physically_continuous;
125 };
126 
127 static inline struct efa_dev *to_edev(struct ib_device *ibdev)
128 {
129 	return container_of(ibdev, struct efa_dev, ibdev);
130 }
131 
132 static inline struct efa_ucontext *to_eucontext(struct ib_ucontext *ibucontext)
133 {
134 	return container_of(ibucontext, struct efa_ucontext, ibucontext);
135 }
136 
137 static inline struct efa_pd *to_epd(struct ib_pd *ibpd)
138 {
139 	return container_of(ibpd, struct efa_pd, ibpd);
140 }
141 
142 static inline struct efa_mr *to_emr(struct ib_mr *ibmr)
143 {
144 	return container_of(ibmr, struct efa_mr, ibmr);
145 }
146 
147 static inline struct efa_qp *to_eqp(struct ib_qp *ibqp)
148 {
149 	return container_of(ibqp, struct efa_qp, ibqp);
150 }
151 
152 static inline struct efa_cq *to_ecq(struct ib_cq *ibcq)
153 {
154 	return container_of(ibcq, struct efa_cq, ibcq);
155 }
156 
157 static inline struct efa_ah *to_eah(struct ib_ah *ibah)
158 {
159 	return container_of(ibah, struct efa_ah, ibah);
160 }
161 
162 static inline struct efa_user_mmap_entry *
163 to_emmap(struct rdma_user_mmap_entry *rdma_entry)
164 {
165 	return container_of(rdma_entry, struct efa_user_mmap_entry, rdma_entry);
166 }
167 
168 #define EFA_DEV_CAP(dev, cap) \
169 	((dev)->dev_attr.device_caps & \
170 	 EFA_ADMIN_FEATURE_DEVICE_ATTR_DESC_##cap##_MASK)
171 
172 #define is_reserved_cleared(reserved) \
173 	!memchr_inv(reserved, 0, sizeof(reserved))
174 
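/*
 * Allocate @size bytes of zeroed, physically contiguous pages and DMA-map
 * them in the requested direction. Buffers allocated here (RQ and CQ rings)
 * are later exposed to userspace through EFA_MMAP_DMA_PAGE entries.
 */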
175 static void *efa_zalloc_mapped(struct efa_dev *dev, dma_addr_t *dma_addr,
176 			       size_t size, enum dma_data_direction dir)
177 {
178 	void *addr;
179 
180 	addr = alloc_pages_exact(size, GFP_KERNEL | __GFP_ZERO);
181 	if (!addr)
182 		return NULL;
183 
184 	*dma_addr = dma_map_single(&dev->pdev->dev, addr, size, dir);
185 	if (dma_mapping_error(&dev->pdev->dev, *dma_addr)) {
186 		ibdev_err(&dev->ibdev, "Failed to map DMA address\n");
187 		free_pages_exact(addr, size);
188 		return NULL;
189 	}
190 
191 	return addr;
192 }
193 
194 static void efa_free_mapped(struct efa_dev *dev, void *cpu_addr,
195 			    dma_addr_t dma_addr,
196 			    size_t size, enum dma_data_direction dir)
197 {
198 	dma_unmap_single(&dev->pdev->dev, dma_addr, size, dir);
199 	free_pages_exact(cpu_addr, size);
200 }
201 
202 int efa_query_device(struct ib_device *ibdev,
203 		     struct ib_device_attr *props,
204 		     struct ib_udata *udata)
205 {
206 	struct efa_com_get_device_attr_result *dev_attr;
207 	struct efa_ibv_ex_query_device_resp resp = {};
208 	struct efa_dev *dev = to_edev(ibdev);
209 	int err;
210 
211 	if (udata && udata->inlen &&
212 	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
213 		ibdev_dbg(ibdev,
214 			  "Incompatible ABI params, udata not cleared\n");
215 		return -EINVAL;
216 	}
217 
218 	dev_attr = &dev->dev_attr;
219 
220 	memset(props, 0, sizeof(*props));
221 	props->max_mr_size = dev_attr->max_mr_pages * PAGE_SIZE;
222 	props->page_size_cap = dev_attr->page_size_cap;
223 	props->vendor_id = dev->pdev->vendor;
224 	props->vendor_part_id = dev->pdev->device;
225 	props->hw_ver = dev->pdev->subsystem_device;
226 	props->max_qp = dev_attr->max_qp;
227 	props->max_cq = dev_attr->max_cq;
228 	props->max_pd = dev_attr->max_pd;
229 	props->max_mr = dev_attr->max_mr;
230 	props->max_ah = dev_attr->max_ah;
231 	props->max_cqe = dev_attr->max_cq_depth;
232 	props->max_qp_wr = min_t(u32, dev_attr->max_sq_depth,
233 				 dev_attr->max_rq_depth);
234 	props->max_send_sge = dev_attr->max_sq_sge;
235 	props->max_recv_sge = dev_attr->max_rq_sge;
236 	props->max_sge_rd = dev_attr->max_wr_rdma_sge;
237 	props->max_pkeys = 1;
238 
239 	if (udata && udata->outlen) {
240 		resp.max_sq_sge = dev_attr->max_sq_sge;
241 		resp.max_rq_sge = dev_attr->max_rq_sge;
242 		resp.max_sq_wr = dev_attr->max_sq_depth;
243 		resp.max_rq_wr = dev_attr->max_rq_depth;
244 		resp.max_rdma_size = dev_attr->max_rdma_size;
245 
246 		resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_WITH_SGID;
247 		if (EFA_DEV_CAP(dev, RDMA_READ))
248 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RDMA_READ;
249 
250 		if (EFA_DEV_CAP(dev, RNR_RETRY))
251 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_RNR_RETRY;
252 
253 		if (dev->neqs)
254 			resp.device_caps |= EFA_QUERY_DEVICE_CAPS_CQ_NOTIFICATIONS;
255 
256 		err = ib_copy_to_udata(udata, &resp,
257 				       min(sizeof(resp), udata->outlen));
258 		if (err) {
259 			ibdev_dbg(ibdev,
260 				  "Failed to copy udata for query_device\n");
261 			return err;
262 		}
263 	}
264 
265 	return 0;
266 }
267 
268 int efa_query_port(struct ib_device *ibdev, u32 port,
269 		   struct ib_port_attr *props)
270 {
271 	struct efa_dev *dev = to_edev(ibdev);
272 
273 	props->lmc = 1;
274 
275 	props->state = IB_PORT_ACTIVE;
276 	props->phys_state = IB_PORT_PHYS_STATE_LINK_UP;
277 	props->gid_tbl_len = 1;
278 	props->pkey_tbl_len = 1;
279 	props->active_speed = IB_SPEED_EDR;
280 	props->active_width = IB_WIDTH_4X;
281 	props->max_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu);
282 	props->active_mtu = ib_mtu_int_to_enum(dev->dev_attr.mtu);
283 	props->max_msg_sz = dev->dev_attr.mtu;
284 	props->max_vl_num = 1;
285 
286 	return 0;
287 }
288 
289 int efa_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
290 		 int qp_attr_mask,
291 		 struct ib_qp_init_attr *qp_init_attr)
292 {
293 	struct efa_dev *dev = to_edev(ibqp->device);
294 	struct efa_com_query_qp_params params = {};
295 	struct efa_com_query_qp_result result;
296 	struct efa_qp *qp = to_eqp(ibqp);
297 	int err;
298 
299 #define EFA_QUERY_QP_SUPP_MASK \
300 	(IB_QP_STATE | IB_QP_PKEY_INDEX | IB_QP_PORT | \
301 	 IB_QP_QKEY | IB_QP_SQ_PSN | IB_QP_CAP | IB_QP_RNR_RETRY)
302 
303 	if (qp_attr_mask & ~EFA_QUERY_QP_SUPP_MASK) {
304 		ibdev_dbg(&dev->ibdev,
305 			  "Unsupported qp_attr_mask[%#x] supported[%#x]\n",
306 			  qp_attr_mask, EFA_QUERY_QP_SUPP_MASK);
307 		return -EOPNOTSUPP;
308 	}
309 
310 	memset(qp_attr, 0, sizeof(*qp_attr));
311 	memset(qp_init_attr, 0, sizeof(*qp_init_attr));
312 
313 	params.qp_handle = qp->qp_handle;
314 	err = efa_com_query_qp(&dev->edev, &params, &result);
315 	if (err)
316 		return err;
317 
318 	qp_attr->qp_state = result.qp_state;
319 	qp_attr->qkey = result.qkey;
320 	qp_attr->sq_psn = result.sq_psn;
321 	qp_attr->sq_draining = result.sq_draining;
322 	qp_attr->port_num = 1;
323 	qp_attr->rnr_retry = result.rnr_retry;
324 
325 	qp_attr->cap.max_send_wr = qp->max_send_wr;
326 	qp_attr->cap.max_recv_wr = qp->max_recv_wr;
327 	qp_attr->cap.max_send_sge = qp->max_send_sge;
328 	qp_attr->cap.max_recv_sge = qp->max_recv_sge;
329 	qp_attr->cap.max_inline_data = qp->max_inline_data;
330 
331 	qp_init_attr->qp_type = ibqp->qp_type;
332 	qp_init_attr->recv_cq = ibqp->recv_cq;
333 	qp_init_attr->send_cq = ibqp->send_cq;
334 	qp_init_attr->qp_context = ibqp->qp_context;
335 	qp_init_attr->cap = qp_attr->cap;
336 
337 	return 0;
338 }
339 
340 int efa_query_gid(struct ib_device *ibdev, u32 port, int index,
341 		  union ib_gid *gid)
342 {
343 	struct efa_dev *dev = to_edev(ibdev);
344 
345 	memcpy(gid->raw, dev->dev_attr.addr, sizeof(dev->dev_attr.addr));
346 
347 	return 0;
348 }
349 
350 int efa_query_pkey(struct ib_device *ibdev, u32 port, u16 index,
351 		   u16 *pkey)
352 {
353 	if (index > 0)
354 		return -EINVAL;
355 
356 	*pkey = 0xffff;
357 	return 0;
358 }
359 
360 static int efa_pd_dealloc(struct efa_dev *dev, u16 pdn)
361 {
362 	struct efa_com_dealloc_pd_params params = {
363 		.pdn = pdn,
364 	};
365 
366 	return efa_com_dealloc_pd(&dev->edev, &params);
367 }
368 
369 int efa_alloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
370 {
371 	struct efa_dev *dev = to_edev(ibpd->device);
372 	struct efa_ibv_alloc_pd_resp resp = {};
373 	struct efa_com_alloc_pd_result result;
374 	struct efa_pd *pd = to_epd(ibpd);
375 	int err;
376 
377 	if (udata->inlen &&
378 	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
379 		ibdev_dbg(&dev->ibdev,
380 			  "Incompatible ABI params, udata not cleared\n");
381 		err = -EINVAL;
382 		goto err_out;
383 	}
384 
385 	err = efa_com_alloc_pd(&dev->edev, &result);
386 	if (err)
387 		goto err_out;
388 
389 	pd->pdn = result.pdn;
390 	resp.pdn = result.pdn;
391 
392 	if (udata->outlen) {
393 		err = ib_copy_to_udata(udata, &resp,
394 				       min(sizeof(resp), udata->outlen));
395 		if (err) {
396 			ibdev_dbg(&dev->ibdev,
397 				  "Failed to copy udata for alloc_pd\n");
398 			goto err_dealloc_pd;
399 		}
400 	}
401 
402 	ibdev_dbg(&dev->ibdev, "Allocated pd[%d]\n", pd->pdn);
403 
404 	return 0;
405 
406 err_dealloc_pd:
407 	efa_pd_dealloc(dev, result.pdn);
408 err_out:
409 	atomic64_inc(&dev->stats.alloc_pd_err);
410 	return err;
411 }
412 
413 int efa_dealloc_pd(struct ib_pd *ibpd, struct ib_udata *udata)
414 {
415 	struct efa_dev *dev = to_edev(ibpd->device);
416 	struct efa_pd *pd = to_epd(ibpd);
417 
418 	ibdev_dbg(&dev->ibdev, "Dealloc pd[%d]\n", pd->pdn);
419 	efa_pd_dealloc(dev, pd->pdn);
420 	return 0;
421 }
422 
423 static int efa_destroy_qp_handle(struct efa_dev *dev, u32 qp_handle)
424 {
425 	struct efa_com_destroy_qp_params params = { .qp_handle = qp_handle };
426 
427 	return efa_com_destroy_qp(&dev->edev, &params);
428 }
429 
430 static void efa_qp_user_mmap_entries_remove(struct efa_qp *qp)
431 {
432 	rdma_user_mmap_entry_remove(qp->rq_mmap_entry);
433 	rdma_user_mmap_entry_remove(qp->rq_db_mmap_entry);
434 	rdma_user_mmap_entry_remove(qp->llq_desc_mmap_entry);
435 	rdma_user_mmap_entry_remove(qp->sq_db_mmap_entry);
436 }
437 
438 int efa_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)
439 {
440 	struct efa_dev *dev = to_edev(ibqp->pd->device);
441 	struct efa_qp *qp = to_eqp(ibqp);
442 	int err;
443 
444 	ibdev_dbg(&dev->ibdev, "Destroy qp[%u]\n", ibqp->qp_num);
445 
446 	efa_qp_user_mmap_entries_remove(qp);
447 
448 	err = efa_destroy_qp_handle(dev, qp->qp_handle);
449 	if (err)
450 		return err;
451 
452 	if (qp->rq_cpu_addr) {
453 		ibdev_dbg(&dev->ibdev,
454 			  "qp->cpu_addr[0x%p] freed: size[%lu], dma[%pad]\n",
455 			  qp->rq_cpu_addr, qp->rq_size,
456 			  &qp->rq_dma_addr);
457 		efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr,
458 				qp->rq_size, DMA_TO_DEVICE);
459 	}
460 
461 	return 0;
462 }
463 
464 static struct rdma_user_mmap_entry*
465 efa_user_mmap_entry_insert(struct ib_ucontext *ucontext,
466 			   u64 address, size_t length,
467 			   u8 mmap_flag, u64 *offset)
468 {
469 	struct efa_user_mmap_entry *entry = kzalloc(sizeof(*entry), GFP_KERNEL);
470 	int err;
471 
472 	if (!entry)
473 		return NULL;
474 
475 	entry->address = address;
476 	entry->mmap_flag = mmap_flag;
477 
478 	err = rdma_user_mmap_entry_insert(ucontext, &entry->rdma_entry,
479 					  length);
480 	if (err) {
481 		kfree(entry);
482 		return NULL;
483 	}
484 	*offset = rdma_user_mmap_get_offset(&entry->rdma_entry);
485 
486 	return &entry->rdma_entry;
487 }
488 
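/*
 * Create the per-QP mmap entries (SQ doorbell, LLQ descriptor space and, for
 * QPs with a receive queue, RQ doorbell and RQ buffer) and hand the resulting
 * mmap keys back to userspace in the create_qp response; the *_offset fields
 * in the response are reduced to their in-page offsets.
 */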
489 static int qp_mmap_entries_setup(struct efa_qp *qp,
490 				 struct efa_dev *dev,
491 				 struct efa_ucontext *ucontext,
492 				 struct efa_com_create_qp_params *params,
493 				 struct efa_ibv_create_qp_resp *resp)
494 {
495 	size_t length;
496 	u64 address;
497 
498 	address = dev->db_bar_addr + resp->sq_db_offset;
499 	qp->sq_db_mmap_entry =
500 		efa_user_mmap_entry_insert(&ucontext->ibucontext,
501 					   address,
502 					   PAGE_SIZE, EFA_MMAP_IO_NC,
503 					   &resp->sq_db_mmap_key);
504 	if (!qp->sq_db_mmap_entry)
505 		return -ENOMEM;
506 
507 	resp->sq_db_offset &= ~PAGE_MASK;
508 
509 	address = dev->mem_bar_addr + resp->llq_desc_offset;
510 	length = PAGE_ALIGN(params->sq_ring_size_in_bytes +
511 			    (resp->llq_desc_offset & ~PAGE_MASK));
512 
513 	qp->llq_desc_mmap_entry =
514 		efa_user_mmap_entry_insert(&ucontext->ibucontext,
515 					   address, length,
516 					   EFA_MMAP_IO_WC,
517 					   &resp->llq_desc_mmap_key);
518 	if (!qp->llq_desc_mmap_entry)
519 		goto err_remove_mmap;
520 
521 	resp->llq_desc_offset &= ~PAGE_MASK;
522 
523 	if (qp->rq_size) {
524 		address = dev->db_bar_addr + resp->rq_db_offset;
525 
526 		qp->rq_db_mmap_entry =
527 			efa_user_mmap_entry_insert(&ucontext->ibucontext,
528 						   address, PAGE_SIZE,
529 						   EFA_MMAP_IO_NC,
530 						   &resp->rq_db_mmap_key);
531 		if (!qp->rq_db_mmap_entry)
532 			goto err_remove_mmap;
533 
534 		resp->rq_db_offset &= ~PAGE_MASK;
535 
536 		address = virt_to_phys(qp->rq_cpu_addr);
537 		qp->rq_mmap_entry =
538 			efa_user_mmap_entry_insert(&ucontext->ibucontext,
539 						   address, qp->rq_size,
540 						   EFA_MMAP_DMA_PAGE,
541 						   &resp->rq_mmap_key);
542 		if (!qp->rq_mmap_entry)
543 			goto err_remove_mmap;
544 
545 		resp->rq_mmap_size = qp->rq_size;
546 	}
547 
548 	return 0;
549 
550 err_remove_mmap:
551 	efa_qp_user_mmap_entries_remove(qp);
552 
553 	return -ENOMEM;
554 }
555 
556 static int efa_qp_validate_cap(struct efa_dev *dev,
557 			       struct ib_qp_init_attr *init_attr)
558 {
559 	if (init_attr->cap.max_send_wr > dev->dev_attr.max_sq_depth) {
560 		ibdev_dbg(&dev->ibdev,
561 			  "qp: requested send wr[%u] exceeds the max[%u]\n",
562 			  init_attr->cap.max_send_wr,
563 			  dev->dev_attr.max_sq_depth);
564 		return -EINVAL;
565 	}
566 	if (init_attr->cap.max_recv_wr > dev->dev_attr.max_rq_depth) {
567 		ibdev_dbg(&dev->ibdev,
568 			  "qp: requested receive wr[%u] exceeds the max[%u]\n",
569 			  init_attr->cap.max_recv_wr,
570 			  dev->dev_attr.max_rq_depth);
571 		return -EINVAL;
572 	}
573 	if (init_attr->cap.max_send_sge > dev->dev_attr.max_sq_sge) {
574 		ibdev_dbg(&dev->ibdev,
575 			  "qp: requested sge send[%u] exceeds the max[%u]\n",
576 			  init_attr->cap.max_send_sge, dev->dev_attr.max_sq_sge);
577 		return -EINVAL;
578 	}
579 	if (init_attr->cap.max_recv_sge > dev->dev_attr.max_rq_sge) {
580 		ibdev_dbg(&dev->ibdev,
581 			  "qp: requested sge recv[%u] exceeds the max[%u]\n",
582 			  init_attr->cap.max_recv_sge, dev->dev_attr.max_rq_sge);
583 		return -EINVAL;
584 	}
585 	if (init_attr->cap.max_inline_data > dev->dev_attr.inline_buf_size) {
586 		ibdev_dbg(&dev->ibdev,
587 			  "qp: requested inline data[%u] exceeds the max[%u]\n",
588 			  init_attr->cap.max_inline_data,
589 			  dev->dev_attr.inline_buf_size);
590 		return -EINVAL;
591 	}
592 
593 	return 0;
594 }
595 
596 static int efa_qp_validate_attr(struct efa_dev *dev,
597 				struct ib_qp_init_attr *init_attr)
598 {
599 	if (init_attr->qp_type != IB_QPT_DRIVER &&
600 	    init_attr->qp_type != IB_QPT_UD) {
601 		ibdev_dbg(&dev->ibdev,
602 			  "Unsupported qp type %d\n", init_attr->qp_type);
603 		return -EOPNOTSUPP;
604 	}
605 
606 	if (init_attr->srq) {
607 		ibdev_dbg(&dev->ibdev, "SRQ is not supported\n");
608 		return -EOPNOTSUPP;
609 	}
610 
611 	if (init_attr->create_flags) {
612 		ibdev_dbg(&dev->ibdev, "Unsupported create flags\n");
613 		return -EOPNOTSUPP;
614 	}
615 
616 	return 0;
617 }
618 
619 int efa_create_qp(struct ib_qp *ibqp, struct ib_qp_init_attr *init_attr,
620 		  struct ib_udata *udata)
621 {
622 	struct efa_com_create_qp_params create_qp_params = {};
623 	struct efa_com_create_qp_result create_qp_resp;
624 	struct efa_dev *dev = to_edev(ibqp->device);
625 	struct efa_ibv_create_qp_resp resp = {};
626 	struct efa_ibv_create_qp cmd = {};
627 	struct efa_qp *qp = to_eqp(ibqp);
628 	struct efa_ucontext *ucontext;
629 	int err;
630 
631 	ucontext = rdma_udata_to_drv_context(udata, struct efa_ucontext,
632 					     ibucontext);
633 
634 	err = efa_qp_validate_cap(dev, init_attr);
635 	if (err)
636 		goto err_out;
637 
638 	err = efa_qp_validate_attr(dev, init_attr);
639 	if (err)
640 		goto err_out;
641 
642 	if (offsetofend(typeof(cmd), driver_qp_type) > udata->inlen) {
643 		ibdev_dbg(&dev->ibdev,
644 			  "Incompatible ABI params, no input udata\n");
645 		err = -EINVAL;
646 		goto err_out;
647 	}
648 
649 	if (udata->inlen > sizeof(cmd) &&
650 	    !ib_is_udata_cleared(udata, sizeof(cmd),
651 				 udata->inlen - sizeof(cmd))) {
652 		ibdev_dbg(&dev->ibdev,
653 			  "Incompatible ABI params, unknown fields in udata\n");
654 		err = -EINVAL;
655 		goto err_out;
656 	}
657 
658 	err = ib_copy_from_udata(&cmd, udata,
659 				 min(sizeof(cmd), udata->inlen));
660 	if (err) {
661 		ibdev_dbg(&dev->ibdev,
662 			  "Cannot copy udata for create_qp\n");
663 		goto err_out;
664 	}
665 
666 	if (cmd.comp_mask) {
667 		ibdev_dbg(&dev->ibdev,
668 			  "Incompatible ABI params, unknown fields in udata\n");
669 		err = -EINVAL;
670 		goto err_out;
671 	}
672 
673 	create_qp_params.uarn = ucontext->uarn;
674 	create_qp_params.pd = to_epd(ibqp->pd)->pdn;
675 
676 	if (init_attr->qp_type == IB_QPT_UD) {
677 		create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_UD;
678 	} else if (cmd.driver_qp_type == EFA_QP_DRIVER_TYPE_SRD) {
679 		create_qp_params.qp_type = EFA_ADMIN_QP_TYPE_SRD;
680 	} else {
681 		ibdev_dbg(&dev->ibdev,
682 			  "Unsupported qp type %d driver qp type %d\n",
683 			  init_attr->qp_type, cmd.driver_qp_type);
684 		err = -EOPNOTSUPP;
685 		goto err_out;
686 	}
687 
688 	ibdev_dbg(&dev->ibdev, "Create QP: qp type %d driver qp type %#x\n",
689 		  init_attr->qp_type, cmd.driver_qp_type);
690 	create_qp_params.send_cq_idx = to_ecq(init_attr->send_cq)->cq_idx;
691 	create_qp_params.recv_cq_idx = to_ecq(init_attr->recv_cq)->cq_idx;
692 	create_qp_params.sq_depth = init_attr->cap.max_send_wr;
693 	create_qp_params.sq_ring_size_in_bytes = cmd.sq_ring_size;
694 
695 	create_qp_params.rq_depth = init_attr->cap.max_recv_wr;
696 	create_qp_params.rq_ring_size_in_bytes = cmd.rq_ring_size;
697 	qp->rq_size = PAGE_ALIGN(create_qp_params.rq_ring_size_in_bytes);
698 	if (qp->rq_size) {
699 		qp->rq_cpu_addr = efa_zalloc_mapped(dev, &qp->rq_dma_addr,
700 						    qp->rq_size, DMA_TO_DEVICE);
701 		if (!qp->rq_cpu_addr) {
702 			err = -ENOMEM;
703 			goto err_out;
704 		}
705 
706 		ibdev_dbg(&dev->ibdev,
707 			  "qp->cpu_addr[0x%p] allocated: size[%lu], dma[%pad]\n",
708 			  qp->rq_cpu_addr, qp->rq_size, &qp->rq_dma_addr);
709 		create_qp_params.rq_base_addr = qp->rq_dma_addr;
710 	}
711 
712 	err = efa_com_create_qp(&dev->edev, &create_qp_params,
713 				&create_qp_resp);
714 	if (err)
715 		goto err_free_mapped;
716 
717 	resp.sq_db_offset = create_qp_resp.sq_db_offset;
718 	resp.rq_db_offset = create_qp_resp.rq_db_offset;
719 	resp.llq_desc_offset = create_qp_resp.llq_descriptors_offset;
720 	resp.send_sub_cq_idx = create_qp_resp.send_sub_cq_idx;
721 	resp.recv_sub_cq_idx = create_qp_resp.recv_sub_cq_idx;
722 
723 	err = qp_mmap_entries_setup(qp, dev, ucontext, &create_qp_params,
724 				    &resp);
725 	if (err)
726 		goto err_destroy_qp;
727 
728 	qp->qp_handle = create_qp_resp.qp_handle;
729 	qp->ibqp.qp_num = create_qp_resp.qp_num;
730 	qp->max_send_wr = init_attr->cap.max_send_wr;
731 	qp->max_recv_wr = init_attr->cap.max_recv_wr;
732 	qp->max_send_sge = init_attr->cap.max_send_sge;
733 	qp->max_recv_sge = init_attr->cap.max_recv_sge;
734 	qp->max_inline_data = init_attr->cap.max_inline_data;
735 
736 	if (udata->outlen) {
737 		err = ib_copy_to_udata(udata, &resp,
738 				       min(sizeof(resp), udata->outlen));
739 		if (err) {
740 			ibdev_dbg(&dev->ibdev,
741 				  "Failed to copy udata for qp[%u]\n",
742 				  create_qp_resp.qp_num);
743 			goto err_remove_mmap_entries;
744 		}
745 	}
746 
747 	ibdev_dbg(&dev->ibdev, "Created qp[%d]\n", qp->ibqp.qp_num);
748 
749 	return 0;
750 
751 err_remove_mmap_entries:
752 	efa_qp_user_mmap_entries_remove(qp);
753 err_destroy_qp:
754 	efa_destroy_qp_handle(dev, create_qp_resp.qp_handle);
755 err_free_mapped:
756 	if (qp->rq_size)
757 		efa_free_mapped(dev, qp->rq_cpu_addr, qp->rq_dma_addr,
758 				qp->rq_size, DMA_TO_DEVICE);
759 err_out:
760 	atomic64_inc(&dev->stats.create_qp_err);
761 	return err;
762 }
763 
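/*
 * QP state machine for SRD (IB_QPT_DRIVER) QPs: for each cur_state/next_state
 * pair the table records whether the transition is valid and which attribute
 * mask bits are required or optional. Used by efa_modify_srd_qp_is_ok() as
 * the SRD counterpart of ib_modify_qp_is_ok().
 */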
764 static const struct {
765 	int			valid;
766 	enum ib_qp_attr_mask	req_param;
767 	enum ib_qp_attr_mask	opt_param;
768 } srd_qp_state_table[IB_QPS_ERR + 1][IB_QPS_ERR + 1] = {
769 	[IB_QPS_RESET] = {
770 		[IB_QPS_RESET] = { .valid = 1 },
771 		[IB_QPS_INIT]  = {
772 			.valid = 1,
773 			.req_param = IB_QP_PKEY_INDEX |
774 				     IB_QP_PORT |
775 				     IB_QP_QKEY,
776 		},
777 	},
778 	[IB_QPS_INIT] = {
779 		[IB_QPS_RESET] = { .valid = 1 },
780 		[IB_QPS_ERR]   = { .valid = 1 },
781 		[IB_QPS_INIT]  = {
782 			.valid = 1,
783 			.opt_param = IB_QP_PKEY_INDEX |
784 				     IB_QP_PORT |
785 				     IB_QP_QKEY,
786 		},
787 		[IB_QPS_RTR]   = {
788 			.valid = 1,
789 			.opt_param = IB_QP_PKEY_INDEX |
790 				     IB_QP_QKEY,
791 		},
792 	},
793 	[IB_QPS_RTR] = {
794 		[IB_QPS_RESET] = { .valid = 1 },
795 		[IB_QPS_ERR]   = { .valid = 1 },
796 		[IB_QPS_RTS]   = {
797 			.valid = 1,
798 			.req_param = IB_QP_SQ_PSN,
799 			.opt_param = IB_QP_CUR_STATE |
800 				     IB_QP_QKEY |
801 				     IB_QP_RNR_RETRY,
802 
803 		}
804 	},
805 	[IB_QPS_RTS] = {
806 		[IB_QPS_RESET] = { .valid = 1 },
807 		[IB_QPS_ERR]   = { .valid = 1 },
808 		[IB_QPS_RTS]   = {
809 			.valid = 1,
810 			.opt_param = IB_QP_CUR_STATE |
811 				     IB_QP_QKEY,
812 		},
813 		[IB_QPS_SQD] = {
814 			.valid = 1,
815 			.opt_param = IB_QP_EN_SQD_ASYNC_NOTIFY,
816 		},
817 	},
818 	[IB_QPS_SQD] = {
819 		[IB_QPS_RESET] = { .valid = 1 },
820 		[IB_QPS_ERR]   = { .valid = 1 },
821 		[IB_QPS_RTS]   = {
822 			.valid = 1,
823 			.opt_param = IB_QP_CUR_STATE |
824 				     IB_QP_QKEY,
825 		},
826 		[IB_QPS_SQD] = {
827 			.valid = 1,
828 			.opt_param = IB_QP_PKEY_INDEX |
829 				     IB_QP_QKEY,
830 		}
831 	},
832 	[IB_QPS_SQE] = {
833 		[IB_QPS_RESET] = { .valid = 1 },
834 		[IB_QPS_ERR]   = { .valid = 1 },
835 		[IB_QPS_RTS]   = {
836 			.valid = 1,
837 			.opt_param = IB_QP_CUR_STATE |
838 				     IB_QP_QKEY,
839 		}
840 	},
841 	[IB_QPS_ERR] = {
842 		[IB_QPS_RESET] = { .valid = 1 },
843 		[IB_QPS_ERR]   = { .valid = 1 },
844 	}
845 };
846 
847 static bool efa_modify_srd_qp_is_ok(enum ib_qp_state cur_state,
848 				    enum ib_qp_state next_state,
849 				    enum ib_qp_attr_mask mask)
850 {
851 	enum ib_qp_attr_mask req_param, opt_param;
852 
853 	if (mask & IB_QP_CUR_STATE  &&
854 	    cur_state != IB_QPS_RTR && cur_state != IB_QPS_RTS &&
855 	    cur_state != IB_QPS_SQD && cur_state != IB_QPS_SQE)
856 		return false;
857 
858 	if (!srd_qp_state_table[cur_state][next_state].valid)
859 		return false;
860 
861 	req_param = srd_qp_state_table[cur_state][next_state].req_param;
862 	opt_param = srd_qp_state_table[cur_state][next_state].opt_param;
863 
864 	if ((mask & req_param) != req_param)
865 		return false;
866 
867 	if (mask & ~(req_param | opt_param | IB_QP_STATE))
868 		return false;
869 
870 	return true;
871 }
872 
873 static int efa_modify_qp_validate(struct efa_dev *dev, struct efa_qp *qp,
874 				  struct ib_qp_attr *qp_attr, int qp_attr_mask,
875 				  enum ib_qp_state cur_state,
876 				  enum ib_qp_state new_state)
877 {
878 	int err;
879 
880 #define EFA_MODIFY_QP_SUPP_MASK \
881 	(IB_QP_STATE | IB_QP_CUR_STATE | IB_QP_EN_SQD_ASYNC_NOTIFY | \
882 	 IB_QP_PKEY_INDEX | IB_QP_PORT | IB_QP_QKEY | IB_QP_SQ_PSN | \
883 	 IB_QP_RNR_RETRY)
884 
885 	if (qp_attr_mask & ~EFA_MODIFY_QP_SUPP_MASK) {
886 		ibdev_dbg(&dev->ibdev,
887 			  "Unsupported qp_attr_mask[%#x] supported[%#x]\n",
888 			  qp_attr_mask, EFA_MODIFY_QP_SUPP_MASK);
889 		return -EOPNOTSUPP;
890 	}
891 
892 	if (qp->ibqp.qp_type == IB_QPT_DRIVER)
893 		err = !efa_modify_srd_qp_is_ok(cur_state, new_state,
894 					       qp_attr_mask);
895 	else
896 		err = !ib_modify_qp_is_ok(cur_state, new_state, IB_QPT_UD,
897 					  qp_attr_mask);
898 
899 	if (err) {
900 		ibdev_dbg(&dev->ibdev, "Invalid modify QP parameters\n");
901 		return -EINVAL;
902 	}
903 
904 	if ((qp_attr_mask & IB_QP_PORT) && qp_attr->port_num != 1) {
905 		ibdev_dbg(&dev->ibdev, "Can't change port num\n");
906 		return -EOPNOTSUPP;
907 	}
908 
909 	if ((qp_attr_mask & IB_QP_PKEY_INDEX) && qp_attr->pkey_index) {
910 		ibdev_dbg(&dev->ibdev, "Can't change pkey index\n");
911 		return -EOPNOTSUPP;
912 	}
913 
914 	return 0;
915 }
916 
917 int efa_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,
918 		  int qp_attr_mask, struct ib_udata *udata)
919 {
920 	struct efa_dev *dev = to_edev(ibqp->device);
921 	struct efa_com_modify_qp_params params = {};
922 	struct efa_qp *qp = to_eqp(ibqp);
923 	enum ib_qp_state cur_state;
924 	enum ib_qp_state new_state;
925 	int err;
926 
927 	if (qp_attr_mask & ~IB_QP_ATTR_STANDARD_BITS)
928 		return -EOPNOTSUPP;
929 
930 	if (udata->inlen &&
931 	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
932 		ibdev_dbg(&dev->ibdev,
933 			  "Incompatible ABI params, udata not cleared\n");
934 		return -EINVAL;
935 	}
936 
937 	cur_state = qp_attr_mask & IB_QP_CUR_STATE ? qp_attr->cur_qp_state :
938 						     qp->state;
939 	new_state = qp_attr_mask & IB_QP_STATE ? qp_attr->qp_state : cur_state;
940 
941 	err = efa_modify_qp_validate(dev, qp, qp_attr, qp_attr_mask, cur_state,
942 				     new_state);
943 	if (err)
944 		return err;
945 
946 	params.qp_handle = qp->qp_handle;
947 
948 	if (qp_attr_mask & IB_QP_STATE) {
949 		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QP_STATE,
950 			1);
951 		EFA_SET(&params.modify_mask,
952 			EFA_ADMIN_MODIFY_QP_CMD_CUR_QP_STATE, 1);
953 		params.cur_qp_state = cur_state;
954 		params.qp_state = new_state;
955 	}
956 
957 	if (qp_attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY) {
958 		EFA_SET(&params.modify_mask,
959 			EFA_ADMIN_MODIFY_QP_CMD_SQ_DRAINED_ASYNC_NOTIFY, 1);
960 		params.sq_drained_async_notify = qp_attr->en_sqd_async_notify;
961 	}
962 
963 	if (qp_attr_mask & IB_QP_QKEY) {
964 		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_QKEY, 1);
965 		params.qkey = qp_attr->qkey;
966 	}
967 
968 	if (qp_attr_mask & IB_QP_SQ_PSN) {
969 		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_SQ_PSN, 1);
970 		params.sq_psn = qp_attr->sq_psn;
971 	}
972 
973 	if (qp_attr_mask & IB_QP_RNR_RETRY) {
974 		EFA_SET(&params.modify_mask, EFA_ADMIN_MODIFY_QP_CMD_RNR_RETRY,
975 			1);
976 		params.rnr_retry = qp_attr->rnr_retry;
977 	}
978 
979 	err = efa_com_modify_qp(&dev->edev, &params);
980 	if (err)
981 		return err;
982 
983 	qp->state = new_state;
984 
985 	return 0;
986 }
987 
988 static int efa_destroy_cq_idx(struct efa_dev *dev, int cq_idx)
989 {
990 	struct efa_com_destroy_cq_params params = { .cq_idx = cq_idx };
991 
992 	return efa_com_destroy_cq(&dev->edev, &params);
993 }
994 
995 static void efa_cq_user_mmap_entries_remove(struct efa_cq *cq)
996 {
997 	rdma_user_mmap_entry_remove(cq->db_mmap_entry);
998 	rdma_user_mmap_entry_remove(cq->mmap_entry);
999 }
1000 
1001 int efa_destroy_cq(struct ib_cq *ibcq, struct ib_udata *udata)
1002 {
1003 	struct efa_dev *dev = to_edev(ibcq->device);
1004 	struct efa_cq *cq = to_ecq(ibcq);
1005 
1006 	ibdev_dbg(&dev->ibdev,
1007 		  "Destroy cq[%d] virt[0x%p] freed: size[%lu], dma[%pad]\n",
1008 		  cq->cq_idx, cq->cpu_addr, cq->size, &cq->dma_addr);
1009 
1010 	efa_cq_user_mmap_entries_remove(cq);
1011 	efa_destroy_cq_idx(dev, cq->cq_idx);
1012 	if (cq->eq) {
1013 		xa_erase(&dev->cqs_xa, cq->cq_idx);
1014 		synchronize_irq(cq->eq->irq.irqn);
1015 	}
1016 	efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
1017 			DMA_FROM_DEVICE);
1018 	return 0;
1019 }
1020 
1021 static struct efa_eq *efa_vec2eq(struct efa_dev *dev, int vec)
1022 {
1023 	return &dev->eqs[vec];
1024 }
1025 
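/*
 * Expose the CQ buffer (and, when the device returns a valid doorbell offset,
 * the CQ doorbell page) to userspace via mmap entries whose keys are returned
 * in the create_cq response.
 */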
1026 static int cq_mmap_entries_setup(struct efa_dev *dev, struct efa_cq *cq,
1027 				 struct efa_ibv_create_cq_resp *resp,
1028 				 bool db_valid)
1029 {
1030 	resp->q_mmap_size = cq->size;
1031 	cq->mmap_entry = efa_user_mmap_entry_insert(&cq->ucontext->ibucontext,
1032 						    virt_to_phys(cq->cpu_addr),
1033 						    cq->size, EFA_MMAP_DMA_PAGE,
1034 						    &resp->q_mmap_key);
1035 	if (!cq->mmap_entry)
1036 		return -ENOMEM;
1037 
1038 	if (db_valid) {
1039 		cq->db_mmap_entry =
1040 			efa_user_mmap_entry_insert(&cq->ucontext->ibucontext,
1041 						   dev->db_bar_addr + resp->db_off,
1042 						   PAGE_SIZE, EFA_MMAP_IO_NC,
1043 						   &resp->db_mmap_key);
1044 		if (!cq->db_mmap_entry) {
1045 			rdma_user_mmap_entry_remove(cq->mmap_entry);
1046 			return -ENOMEM;
1047 		}
1048 
1049 		resp->db_off &= ~PAGE_MASK;
1050 		resp->comp_mask |= EFA_CREATE_CQ_RESP_DB_OFF;
1051 	}
1052 
1053 	return 0;
1054 }
1055 
1056 int efa_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
1057 		  struct ib_udata *udata)
1058 {
1059 	struct efa_ucontext *ucontext = rdma_udata_to_drv_context(
1060 		udata, struct efa_ucontext, ibucontext);
1061 	struct efa_com_create_cq_params params = {};
1062 	struct efa_ibv_create_cq_resp resp = {};
1063 	struct efa_com_create_cq_result result;
1064 	struct ib_device *ibdev = ibcq->device;
1065 	struct efa_dev *dev = to_edev(ibdev);
1066 	struct efa_ibv_create_cq cmd = {};
1067 	struct efa_cq *cq = to_ecq(ibcq);
1068 	int entries = attr->cqe;
1069 	bool set_src_addr;
1070 	int err;
1071 
1072 	ibdev_dbg(ibdev, "create_cq entries %d\n", entries);
1073 
1074 	if (attr->flags)
1075 		return -EOPNOTSUPP;
1076 
1077 	if (entries < 1 || entries > dev->dev_attr.max_cq_depth) {
1078 		ibdev_dbg(ibdev,
1079 			  "cq: requested entries[%u] non-positive or greater than max[%u]\n",
1080 			  entries, dev->dev_attr.max_cq_depth);
1081 		err = -EINVAL;
1082 		goto err_out;
1083 	}
1084 
1085 	if (offsetofend(typeof(cmd), num_sub_cqs) > udata->inlen) {
1086 		ibdev_dbg(ibdev,
1087 			  "Incompatible ABI params, no input udata\n");
1088 		err = -EINVAL;
1089 		goto err_out;
1090 	}
1091 
1092 	if (udata->inlen > sizeof(cmd) &&
1093 	    !ib_is_udata_cleared(udata, sizeof(cmd),
1094 				 udata->inlen - sizeof(cmd))) {
1095 		ibdev_dbg(ibdev,
1096 			  "Incompatible ABI params, unknown fields in udata\n");
1097 		err = -EINVAL;
1098 		goto err_out;
1099 	}
1100 
1101 	err = ib_copy_from_udata(&cmd, udata,
1102 				 min(sizeof(cmd), udata->inlen));
1103 	if (err) {
1104 		ibdev_dbg(ibdev, "Cannot copy udata for create_cq\n");
1105 		goto err_out;
1106 	}
1107 
1108 	if (cmd.comp_mask || !is_reserved_cleared(cmd.reserved_58)) {
1109 		ibdev_dbg(ibdev,
1110 			  "Incompatible ABI params, unknown fields in udata\n");
1111 		err = -EINVAL;
1112 		goto err_out;
1113 	}
1114 
1115 	set_src_addr = !!(cmd.flags & EFA_CREATE_CQ_WITH_SGID);
1116 	if ((cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc_ex)) &&
1117 	    (set_src_addr ||
1118 	     cmd.cq_entry_size != sizeof(struct efa_io_rx_cdesc))) {
1119 		ibdev_dbg(ibdev,
1120 			  "Invalid entry size [%u]\n", cmd.cq_entry_size);
1121 		err = -EINVAL;
1122 		goto err_out;
1123 	}
1124 
1125 	if (cmd.num_sub_cqs != dev->dev_attr.sub_cqs_per_cq) {
1126 		ibdev_dbg(ibdev,
1127 			  "Invalid number of sub cqs[%u] expected[%u]\n",
1128 			  cmd.num_sub_cqs, dev->dev_attr.sub_cqs_per_cq);
1129 		err = -EINVAL;
1130 		goto err_out;
1131 	}
1132 
1133 	cq->ucontext = ucontext;
1134 	cq->size = PAGE_ALIGN(cmd.cq_entry_size * entries * cmd.num_sub_cqs);
1135 	cq->cpu_addr = efa_zalloc_mapped(dev, &cq->dma_addr, cq->size,
1136 					 DMA_FROM_DEVICE);
1137 	if (!cq->cpu_addr) {
1138 		err = -ENOMEM;
1139 		goto err_out;
1140 	}
1141 
1142 	params.uarn = cq->ucontext->uarn;
1143 	params.cq_depth = entries;
1144 	params.dma_addr = cq->dma_addr;
1145 	params.entry_size_in_bytes = cmd.cq_entry_size;
1146 	params.num_sub_cqs = cmd.num_sub_cqs;
1147 	params.set_src_addr = set_src_addr;
1148 	if (cmd.flags & EFA_CREATE_CQ_WITH_COMPLETION_CHANNEL) {
1149 		cq->eq = efa_vec2eq(dev, attr->comp_vector);
1150 		params.eqn = cq->eq->eeq.eqn;
1151 		params.interrupt_mode_enabled = true;
1152 	}
1153 
1154 	err = efa_com_create_cq(&dev->edev, &params, &result);
1155 	if (err)
1156 		goto err_free_mapped;
1157 
1158 	resp.db_off = result.db_off;
1159 	resp.cq_idx = result.cq_idx;
1160 	cq->cq_idx = result.cq_idx;
1161 	cq->ibcq.cqe = result.actual_depth;
1162 	WARN_ON_ONCE(entries != result.actual_depth);
1163 
1164 	err = cq_mmap_entries_setup(dev, cq, &resp, result.db_valid);
1165 	if (err) {
1166 		ibdev_dbg(ibdev, "Could not setup cq[%u] mmap entries\n",
1167 			  cq->cq_idx);
1168 		goto err_destroy_cq;
1169 	}
1170 
1171 	if (cq->eq) {
1172 		err = xa_err(xa_store(&dev->cqs_xa, cq->cq_idx, cq, GFP_KERNEL));
1173 		if (err) {
1174 			ibdev_dbg(ibdev, "Failed to store cq[%u] in xarray\n",
1175 				  cq->cq_idx);
1176 			goto err_remove_mmap;
1177 		}
1178 	}
1179 
1180 	if (udata->outlen) {
1181 		err = ib_copy_to_udata(udata, &resp,
1182 				       min(sizeof(resp), udata->outlen));
1183 		if (err) {
1184 			ibdev_dbg(ibdev,
1185 				  "Failed to copy udata for create_cq\n");
1186 			goto err_xa_erase;
1187 		}
1188 	}
1189 
1190 	ibdev_dbg(ibdev, "Created cq[%d], cq depth[%u]. dma[%pad] virt[0x%p]\n",
1191 		  cq->cq_idx, result.actual_depth, &cq->dma_addr, cq->cpu_addr);
1192 
1193 	return 0;
1194 
1195 err_xa_erase:
1196 	if (cq->eq)
1197 		xa_erase(&dev->cqs_xa, cq->cq_idx);
1198 err_remove_mmap:
1199 	efa_cq_user_mmap_entries_remove(cq);
1200 err_destroy_cq:
1201 	efa_destroy_cq_idx(dev, cq->cq_idx);
1202 err_free_mapped:
1203 	efa_free_mapped(dev, cq->cpu_addr, cq->dma_addr, cq->size,
1204 			DMA_FROM_DEVICE);
1205 
1206 err_out:
1207 	atomic64_inc(&dev->stats.create_cq_err);
1208 	return err;
1209 }
1210 
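/*
 * Fill @page_list with the DMA address of each hp_shift-sized block of
 * the umem.
 */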
1211 static int umem_to_page_list(struct efa_dev *dev,
1212 			     struct ib_umem *umem,
1213 			     u64 *page_list,
1214 			     u32 hp_cnt,
1215 			     u8 hp_shift)
1216 {
1217 	u32 pages_in_hp = BIT(hp_shift - PAGE_SHIFT);
1218 	struct ib_block_iter biter;
1219 	unsigned int hp_idx = 0;
1220 
1221 	ibdev_dbg(&dev->ibdev, "hp_cnt[%u], pages_in_hp[%u]\n",
1222 		  hp_cnt, pages_in_hp);
1223 
1224 	rdma_umem_for_each_dma_block(umem, &biter, BIT(hp_shift))
1225 		page_list[hp_idx++] = rdma_block_iter_dma_address(&biter);
1226 
1227 	return 0;
1228 }
1229 
1230 static struct scatterlist *efa_vmalloc_buf_to_sg(u64 *buf, int page_cnt)
1231 {
1232 	struct scatterlist *sglist;
1233 	struct page *pg;
1234 	int i;
1235 
1236 	sglist = kmalloc_array(page_cnt, sizeof(*sglist), GFP_KERNEL);
1237 	if (!sglist)
1238 		return NULL;
1239 	sg_init_table(sglist, page_cnt);
1240 	for (i = 0; i < page_cnt; i++) {
1241 		pg = vmalloc_to_page(buf);
1242 		if (!pg)
1243 			goto err;
1244 		sg_set_page(&sglist[i], pg, PAGE_SIZE, 0);
1245 		buf += PAGE_SIZE / sizeof(*buf);
1246 	}
1247 	return sglist;
1248 
1249 err:
1250 	kfree(sglist);
1251 	return NULL;
1252 }
1253 
1254 /*
1255  * create a chunk list of the DMA addresses of the physical pages in the
1256  * supplied scatter-gather list
1257  */
1258 static int pbl_chunk_list_create(struct efa_dev *dev, struct pbl_context *pbl)
1259 {
1260 	struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list;
1261 	int page_cnt = pbl->phys.indirect.pbl_buf_size_in_pages;
1262 	struct scatterlist *pages_sgl = pbl->phys.indirect.sgl;
1263 	unsigned int chunk_list_size, chunk_idx, payload_idx;
1264 	int sg_dma_cnt = pbl->phys.indirect.sg_dma_cnt;
1265 	struct efa_com_ctrl_buff_info *ctrl_buf;
1266 	u64 *cur_chunk_buf, *prev_chunk_buf;
1267 	struct ib_block_iter biter;
1268 	dma_addr_t dma_addr;
1269 	int i;
1270 
1271 	/* allocate a chunk list that consists of 4KB chunks */
1272 	chunk_list_size = DIV_ROUND_UP(page_cnt, EFA_PTRS_PER_CHUNK);
1273 
1274 	chunk_list->size = chunk_list_size;
1275 	chunk_list->chunks = kcalloc(chunk_list_size,
1276 				     sizeof(*chunk_list->chunks),
1277 				     GFP_KERNEL);
1278 	if (!chunk_list->chunks)
1279 		return -ENOMEM;
1280 
1281 	ibdev_dbg(&dev->ibdev,
1282 		  "chunk_list_size[%u] - pages[%u]\n", chunk_list_size,
1283 		  page_cnt);
1284 
1285 	/* allocate chunk buffers: */
1286 	for (i = 0; i < chunk_list_size; i++) {
1287 		chunk_list->chunks[i].buf = kzalloc(EFA_CHUNK_SIZE, GFP_KERNEL);
1288 		if (!chunk_list->chunks[i].buf)
1289 			goto chunk_list_dealloc;
1290 
1291 		chunk_list->chunks[i].length = EFA_CHUNK_USED_SIZE;
1292 	}
1293 	chunk_list->chunks[chunk_list_size - 1].length =
1294 		((page_cnt % EFA_PTRS_PER_CHUNK) * EFA_CHUNK_PAYLOAD_PTR_SIZE) +
1295 			EFA_CHUNK_PTR_SIZE;
1296 
1297 	/* fill the dma addresses of sg list pages to chunks: */
1298 	chunk_idx = 0;
1299 	payload_idx = 0;
1300 	cur_chunk_buf = chunk_list->chunks[0].buf;
1301 	rdma_for_each_block(pages_sgl, &biter, sg_dma_cnt,
1302 			    EFA_CHUNK_PAYLOAD_SIZE) {
1303 		cur_chunk_buf[payload_idx++] =
1304 			rdma_block_iter_dma_address(&biter);
1305 
1306 		if (payload_idx == EFA_PTRS_PER_CHUNK) {
1307 			chunk_idx++;
1308 			cur_chunk_buf = chunk_list->chunks[chunk_idx].buf;
1309 			payload_idx = 0;
1310 		}
1311 	}
1312 
1313 	/* map chunks to dma and fill chunks next ptrs */
1314 	for (i = chunk_list_size - 1; i >= 0; i--) {
1315 		dma_addr = dma_map_single(&dev->pdev->dev,
1316 					  chunk_list->chunks[i].buf,
1317 					  chunk_list->chunks[i].length,
1318 					  DMA_TO_DEVICE);
1319 		if (dma_mapping_error(&dev->pdev->dev, dma_addr)) {
1320 			ibdev_err(&dev->ibdev,
1321 				  "chunk[%u] dma_map_failed\n", i);
1322 			goto chunk_list_unmap;
1323 		}
1324 
1325 		chunk_list->chunks[i].dma_addr = dma_addr;
1326 		ibdev_dbg(&dev->ibdev,
1327 			  "chunk[%u] mapped at [%pad]\n", i, &dma_addr);
1328 
1329 		if (!i)
1330 			break;
1331 
1332 		prev_chunk_buf = chunk_list->chunks[i - 1].buf;
1333 
1334 		ctrl_buf = (struct efa_com_ctrl_buff_info *)
1335 				&prev_chunk_buf[EFA_PTRS_PER_CHUNK];
1336 		ctrl_buf->length = chunk_list->chunks[i].length;
1337 
1338 		efa_com_set_dma_addr(dma_addr,
1339 				     &ctrl_buf->address.mem_addr_high,
1340 				     &ctrl_buf->address.mem_addr_low);
1341 	}
1342 
1343 	return 0;
1344 
1345 chunk_list_unmap:
1346 	for (; i < chunk_list_size; i++) {
1347 		dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr,
1348 				 chunk_list->chunks[i].length, DMA_TO_DEVICE);
1349 	}
1350 chunk_list_dealloc:
1351 	for (i = 0; i < chunk_list_size; i++)
1352 		kfree(chunk_list->chunks[i].buf);
1353 
1354 	kfree(chunk_list->chunks);
1355 	return -ENOMEM;
1356 }
1357 
1358 static void pbl_chunk_list_destroy(struct efa_dev *dev, struct pbl_context *pbl)
1359 {
1360 	struct pbl_chunk_list *chunk_list = &pbl->phys.indirect.chunk_list;
1361 	int i;
1362 
1363 	for (i = 0; i < chunk_list->size; i++) {
1364 		dma_unmap_single(&dev->pdev->dev, chunk_list->chunks[i].dma_addr,
1365 				 chunk_list->chunks[i].length, DMA_TO_DEVICE);
1366 		kfree(chunk_list->chunks[i].buf);
1367 	}
1368 
1369 	kfree(chunk_list->chunks);
1370 }
1371 
1372 /* initialize pbl continuous mode: map the pbl buffer to a dma address. */
1373 static int pbl_continuous_initialize(struct efa_dev *dev,
1374 				     struct pbl_context *pbl)
1375 {
1376 	dma_addr_t dma_addr;
1377 
1378 	dma_addr = dma_map_single(&dev->pdev->dev, pbl->pbl_buf,
1379 				  pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE);
1380 	if (dma_mapping_error(&dev->pdev->dev, dma_addr)) {
1381 		ibdev_err(&dev->ibdev, "Unable to map pbl to DMA address\n");
1382 		return -ENOMEM;
1383 	}
1384 
1385 	pbl->phys.continuous.dma_addr = dma_addr;
1386 	ibdev_dbg(&dev->ibdev,
1387 		  "pbl continuous - dma_addr = %pad, size[%u]\n",
1388 		  &dma_addr, pbl->pbl_buf_size_in_bytes);
1389 
1390 	return 0;
1391 }
1392 
1393 /*
1394  * initialize pbl indirect mode:
1395  * create a chunk list out of the dma addresses of the physical pages of
1396  * the pbl buffer.
1397  */
1398 static int pbl_indirect_initialize(struct efa_dev *dev, struct pbl_context *pbl)
1399 {
1400 	u32 size_in_pages = DIV_ROUND_UP(pbl->pbl_buf_size_in_bytes, PAGE_SIZE);
1401 	struct scatterlist *sgl;
1402 	int sg_dma_cnt, err;
1403 
1404 	BUILD_BUG_ON(EFA_CHUNK_PAYLOAD_SIZE > PAGE_SIZE);
1405 	sgl = efa_vmalloc_buf_to_sg(pbl->pbl_buf, size_in_pages);
1406 	if (!sgl)
1407 		return -ENOMEM;
1408 
1409 	sg_dma_cnt = dma_map_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE);
1410 	if (!sg_dma_cnt) {
1411 		err = -EINVAL;
1412 		goto err_map;
1413 	}
1414 
1415 	pbl->phys.indirect.pbl_buf_size_in_pages = size_in_pages;
1416 	pbl->phys.indirect.sgl = sgl;
1417 	pbl->phys.indirect.sg_dma_cnt = sg_dma_cnt;
1418 	err = pbl_chunk_list_create(dev, pbl);
1419 	if (err) {
1420 		ibdev_dbg(&dev->ibdev,
1421 			  "chunk_list creation failed[%d]\n", err);
1422 		goto err_chunk;
1423 	}
1424 
1425 	ibdev_dbg(&dev->ibdev,
1426 		  "pbl indirect - size[%u], chunks[%u]\n",
1427 		  pbl->pbl_buf_size_in_bytes,
1428 		  pbl->phys.indirect.chunk_list.size);
1429 
1430 	return 0;
1431 
1432 err_chunk:
1433 	dma_unmap_sg(&dev->pdev->dev, sgl, size_in_pages, DMA_TO_DEVICE);
1434 err_map:
1435 	kfree(sgl);
1436 	return err;
1437 }
1438 
1439 static void pbl_indirect_terminate(struct efa_dev *dev, struct pbl_context *pbl)
1440 {
1441 	pbl_chunk_list_destroy(dev, pbl);
1442 	dma_unmap_sg(&dev->pdev->dev, pbl->phys.indirect.sgl,
1443 		     pbl->phys.indirect.pbl_buf_size_in_pages, DMA_TO_DEVICE);
1444 	kfree(pbl->phys.indirect.sgl);
1445 }
1446 
1447 /* create a page buffer list from a mapped user memory region */
1448 static int pbl_create(struct efa_dev *dev,
1449 		      struct pbl_context *pbl,
1450 		      struct ib_umem *umem,
1451 		      int hp_cnt,
1452 		      u8 hp_shift)
1453 {
1454 	int err;
1455 
1456 	pbl->pbl_buf_size_in_bytes = hp_cnt * EFA_CHUNK_PAYLOAD_PTR_SIZE;
1457 	pbl->pbl_buf = kvzalloc(pbl->pbl_buf_size_in_bytes, GFP_KERNEL);
1458 	if (!pbl->pbl_buf)
1459 		return -ENOMEM;
1460 
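	/*
	 * kvzalloc() may have returned either kmalloc (physically contiguous)
	 * or vmalloc memory; pick the PBL mode accordingly: a single DMA
	 * mapping for the contiguous buffer, an indirect chunk list otherwise.
	 */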
1461 	if (is_vmalloc_addr(pbl->pbl_buf)) {
1462 		pbl->physically_continuous = 0;
1463 		err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt,
1464 					hp_shift);
1465 		if (err)
1466 			goto err_free;
1467 
1468 		err = pbl_indirect_initialize(dev, pbl);
1469 		if (err)
1470 			goto err_free;
1471 	} else {
1472 		pbl->physically_continuous = 1;
1473 		err = umem_to_page_list(dev, umem, pbl->pbl_buf, hp_cnt,
1474 					hp_shift);
1475 		if (err)
1476 			goto err_free;
1477 
1478 		err = pbl_continuous_initialize(dev, pbl);
1479 		if (err)
1480 			goto err_free;
1481 	}
1482 
1483 	ibdev_dbg(&dev->ibdev,
1484 		  "user_pbl_created: user_pages[%u], continuous[%u]\n",
1485 		  hp_cnt, pbl->physically_continuous);
1486 
1487 	return 0;
1488 
1489 err_free:
1490 	kvfree(pbl->pbl_buf);
1491 	return err;
1492 }
1493 
1494 static void pbl_destroy(struct efa_dev *dev, struct pbl_context *pbl)
1495 {
1496 	if (pbl->physically_continuous)
1497 		dma_unmap_single(&dev->pdev->dev, pbl->phys.continuous.dma_addr,
1498 				 pbl->pbl_buf_size_in_bytes, DMA_TO_DEVICE);
1499 	else
1500 		pbl_indirect_terminate(dev, pbl);
1501 
1502 	kvfree(pbl->pbl_buf);
1503 }
1504 
1505 static int efa_create_inline_pbl(struct efa_dev *dev, struct efa_mr *mr,
1506 				 struct efa_com_reg_mr_params *params)
1507 {
1508 	int err;
1509 
1510 	params->inline_pbl = 1;
1511 	err = umem_to_page_list(dev, mr->umem, params->pbl.inline_pbl_array,
1512 				params->page_num, params->page_shift);
1513 	if (err)
1514 		return err;
1515 
1516 	ibdev_dbg(&dev->ibdev,
1517 		  "inline_pbl_array - pages[%u]\n", params->page_num);
1518 
1519 	return 0;
1520 }
1521 
1522 static int efa_create_pbl(struct efa_dev *dev,
1523 			  struct pbl_context *pbl,
1524 			  struct efa_mr *mr,
1525 			  struct efa_com_reg_mr_params *params)
1526 {
1527 	int err;
1528 
1529 	err = pbl_create(dev, pbl, mr->umem, params->page_num,
1530 			 params->page_shift);
1531 	if (err) {
1532 		ibdev_dbg(&dev->ibdev, "Failed to create pbl[%d]\n", err);
1533 		return err;
1534 	}
1535 
1536 	params->inline_pbl = 0;
1537 	params->indirect = !pbl->physically_continuous;
1538 	if (pbl->physically_continuous) {
1539 		params->pbl.pbl.length = pbl->pbl_buf_size_in_bytes;
1540 
1541 		efa_com_set_dma_addr(pbl->phys.continuous.dma_addr,
1542 				     &params->pbl.pbl.address.mem_addr_high,
1543 				     &params->pbl.pbl.address.mem_addr_low);
1544 	} else {
1545 		params->pbl.pbl.length =
1546 			pbl->phys.indirect.chunk_list.chunks[0].length;
1547 
1548 		efa_com_set_dma_addr(pbl->phys.indirect.chunk_list.chunks[0].dma_addr,
1549 				     &params->pbl.pbl.address.mem_addr_high,
1550 				     &params->pbl.pbl.address.mem_addr_low);
1551 	}
1552 
1553 	return 0;
1554 }
1555 
1556 static struct efa_mr *efa_alloc_mr(struct ib_pd *ibpd, int access_flags,
1557 				   struct ib_udata *udata)
1558 {
1559 	struct efa_dev *dev = to_edev(ibpd->device);
1560 	int supp_access_flags;
1561 	struct efa_mr *mr;
1562 
1563 	if (udata && udata->inlen &&
1564 	    !ib_is_udata_cleared(udata, 0, sizeof(udata->inlen))) {
1565 		ibdev_dbg(&dev->ibdev,
1566 			  "Incompatible ABI params, udata not cleared\n");
1567 		return ERR_PTR(-EINVAL);
1568 	}
1569 
1570 	supp_access_flags =
1571 		IB_ACCESS_LOCAL_WRITE |
1572 		(EFA_DEV_CAP(dev, RDMA_READ) ? IB_ACCESS_REMOTE_READ : 0);
1573 
1574 	access_flags &= ~IB_ACCESS_OPTIONAL;
1575 	if (access_flags & ~supp_access_flags) {
1576 		ibdev_dbg(&dev->ibdev,
1577 			  "Unsupported access flags[%#x], supported[%#x]\n",
1578 			  access_flags, supp_access_flags);
1579 		return ERR_PTR(-EOPNOTSUPP);
1580 	}
1581 
1582 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1583 	if (!mr)
1584 		return ERR_PTR(-ENOMEM);
1585 
1586 	return mr;
1587 }
1588 
1589 static int efa_register_mr(struct ib_pd *ibpd, struct efa_mr *mr, u64 start,
1590 			   u64 length, u64 virt_addr, int access_flags)
1591 {
1592 	struct efa_dev *dev = to_edev(ibpd->device);
1593 	struct efa_com_reg_mr_params params = {};
1594 	struct efa_com_reg_mr_result result = {};
1595 	struct pbl_context pbl;
1596 	unsigned int pg_sz;
1597 	int inline_size;
1598 	int err;
1599 
1600 	params.pd = to_epd(ibpd)->pdn;
1601 	params.iova = virt_addr;
1602 	params.mr_length_in_bytes = length;
1603 	params.permissions = access_flags;
1604 
1605 	pg_sz = ib_umem_find_best_pgsz(mr->umem,
1606 				       dev->dev_attr.page_size_cap,
1607 				       virt_addr);
1608 	if (!pg_sz) {
1609 		ibdev_dbg(&dev->ibdev, "Failed to find a suitable page size in page_size_cap %#llx\n",
1610 			  dev->dev_attr.page_size_cap);
1611 		return -EOPNOTSUPP;
1612 	}
1613 
1614 	params.page_shift = order_base_2(pg_sz);
1615 	params.page_num = ib_umem_num_dma_blocks(mr->umem, pg_sz);
1616 
1617 	ibdev_dbg(&dev->ibdev,
1618 		  "start %#llx length %#llx params.page_shift %u params.page_num %u\n",
1619 		  start, length, params.page_shift, params.page_num);
1620 
1621 	inline_size = ARRAY_SIZE(params.pbl.inline_pbl_array);
1622 	if (params.page_num <= inline_size) {
1623 		err = efa_create_inline_pbl(dev, mr, &params);
1624 		if (err)
1625 			return err;
1626 
1627 		err = efa_com_register_mr(&dev->edev, &params, &result);
1628 		if (err)
1629 			return err;
1630 	} else {
1631 		err = efa_create_pbl(dev, &pbl, mr, &params);
1632 		if (err)
1633 			return err;
1634 
1635 		err = efa_com_register_mr(&dev->edev, &params, &result);
1636 		pbl_destroy(dev, &pbl);
1637 
1638 		if (err)
1639 			return err;
1640 	}
1641 
1642 	mr->ibmr.lkey = result.l_key;
1643 	mr->ibmr.rkey = result.r_key;
1644 	mr->ibmr.length = length;
1645 	ibdev_dbg(&dev->ibdev, "Registered mr[%d]\n", mr->ibmr.lkey);
1646 
1647 	return 0;
1648 }
1649 
1650 struct ib_mr *efa_reg_user_mr_dmabuf(struct ib_pd *ibpd, u64 start,
1651 				     u64 length, u64 virt_addr,
1652 				     int fd, int access_flags,
1653 				     struct ib_udata *udata)
1654 {
1655 	struct efa_dev *dev = to_edev(ibpd->device);
1656 	struct ib_umem_dmabuf *umem_dmabuf;
1657 	struct efa_mr *mr;
1658 	int err;
1659 
1660 	mr = efa_alloc_mr(ibpd, access_flags, udata);
1661 	if (IS_ERR(mr)) {
1662 		err = PTR_ERR(mr);
1663 		goto err_out;
1664 	}
1665 
1666 	umem_dmabuf = ib_umem_dmabuf_get_pinned(ibpd->device, start, length, fd,
1667 						access_flags);
1668 	if (IS_ERR(umem_dmabuf)) {
1669 		err = PTR_ERR(umem_dmabuf);
1670 		ibdev_dbg(&dev->ibdev, "Failed to get dmabuf umem[%d]\n", err);
1671 		goto err_free;
1672 	}
1673 
1674 	mr->umem = &umem_dmabuf->umem;
1675 	err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
1676 	if (err)
1677 		goto err_release;
1678 
1679 	return &mr->ibmr;
1680 
1681 err_release:
1682 	ib_umem_release(mr->umem);
1683 err_free:
1684 	kfree(mr);
1685 err_out:
1686 	atomic64_inc(&dev->stats.reg_mr_err);
1687 	return ERR_PTR(err);
1688 }
1689 
1690 struct ib_mr *efa_reg_mr(struct ib_pd *ibpd, u64 start, u64 length,
1691 			 u64 virt_addr, int access_flags,
1692 			 struct ib_udata *udata)
1693 {
1694 	struct efa_dev *dev = to_edev(ibpd->device);
1695 	struct efa_mr *mr;
1696 	int err;
1697 
1698 	mr = efa_alloc_mr(ibpd, access_flags, udata);
1699 	if (IS_ERR(mr)) {
1700 		err = PTR_ERR(mr);
1701 		goto err_out;
1702 	}
1703 
1704 	mr->umem = ib_umem_get(ibpd->device, start, length, access_flags);
1705 	if (IS_ERR(mr->umem)) {
1706 		err = PTR_ERR(mr->umem);
1707 		ibdev_dbg(&dev->ibdev,
1708 			  "Failed to pin and map user space memory[%d]\n", err);
1709 		goto err_free;
1710 	}
1711 
1712 	err = efa_register_mr(ibpd, mr, start, length, virt_addr, access_flags);
1713 	if (err)
1714 		goto err_release;
1715 
1716 	return &mr->ibmr;
1717 
1718 err_release:
1719 	ib_umem_release(mr->umem);
1720 err_free:
1721 	kfree(mr);
1722 err_out:
1723 	atomic64_inc(&dev->stats.reg_mr_err);
1724 	return ERR_PTR(err);
1725 }
1726 
1727 int efa_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1728 {
1729 	struct efa_dev *dev = to_edev(ibmr->device);
1730 	struct efa_com_dereg_mr_params params;
1731 	struct efa_mr *mr = to_emr(ibmr);
1732 	int err;
1733 
1734 	ibdev_dbg(&dev->ibdev, "Deregister mr[%d]\n", ibmr->lkey);
1735 
1736 	params.l_key = mr->ibmr.lkey;
1737 	err = efa_com_dereg_mr(&dev->edev, &params);
1738 	if (err)
1739 		return err;
1740 
1741 	ib_umem_release(mr->umem);
1742 	kfree(mr);
1743 
1744 	return 0;
1745 }
1746 
1747 int efa_get_port_immutable(struct ib_device *ibdev, u32 port_num,
1748 			   struct ib_port_immutable *immutable)
1749 {
1750 	struct ib_port_attr attr;
1751 	int err;
1752 
1753 	err = ib_query_port(ibdev, port_num, &attr);
1754 	if (err) {
1755 		ibdev_dbg(ibdev, "Couldn't query port err[%d]\n", err);
1756 		return err;
1757 	}
1758 
1759 	immutable->pkey_tbl_len = attr.pkey_tbl_len;
1760 	immutable->gid_tbl_len = attr.gid_tbl_len;
1761 
1762 	return 0;
1763 }
1764 
1765 static int efa_dealloc_uar(struct efa_dev *dev, u16 uarn)
1766 {
1767 	struct efa_com_dealloc_uar_params params = {
1768 		.uarn = uarn,
1769 	};
1770 
1771 	return efa_com_dealloc_uar(&dev->edev, &params);
1772 }
1773 
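/*
 * Evaluates to a non-NULL attribute name (and sets _attr_str) when the device
 * reports a non-zero _attr but userspace did not acknowledge it through _mask
 * in its comp_mask, i.e. when the handshake for that attribute fails;
 * evaluates to NULL otherwise.
 */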
1774 #define EFA_CHECK_USER_COMP(_dev, _comp_mask, _attr, _mask, _attr_str) \
1775 	(_attr_str = (!(_dev)->dev_attr._attr || ((_comp_mask) & (_mask))) ? \
1776 		     NULL : #_attr)
1777 
1778 static int efa_user_comp_handshake(const struct ib_ucontext *ibucontext,
1779 				   const struct efa_ibv_alloc_ucontext_cmd *cmd)
1780 {
1781 	struct efa_dev *dev = to_edev(ibucontext->device);
1782 	char *attr_str;
1783 
1784 	if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, max_tx_batch,
1785 				EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH, attr_str))
1786 		goto err;
1787 
1788 	if (EFA_CHECK_USER_COMP(dev, cmd->comp_mask, min_sq_depth,
1789 				EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR,
1790 				attr_str))
1791 		goto err;
1792 
1793 	return 0;
1794 
1795 err:
1796 	ibdev_dbg(&dev->ibdev, "Userspace handshake failed for %s attribute\n",
1797 		  attr_str);
1798 	return -EOPNOTSUPP;
1799 }
1800 
1801 int efa_alloc_ucontext(struct ib_ucontext *ibucontext, struct ib_udata *udata)
1802 {
1803 	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
1804 	struct efa_dev *dev = to_edev(ibucontext->device);
1805 	struct efa_ibv_alloc_ucontext_resp resp = {};
1806 	struct efa_ibv_alloc_ucontext_cmd cmd = {};
1807 	struct efa_com_alloc_uar_result result;
1808 	int err;
1809 
1810 	/*
1811 	 * It's fine if the driver does not know all request fields;
1812 	 * we will ack the input fields in our response.
1813 	 */
1814 
1815 	err = ib_copy_from_udata(&cmd, udata,
1816 				 min(sizeof(cmd), udata->inlen));
1817 	if (err) {
1818 		ibdev_dbg(&dev->ibdev,
1819 			  "Cannot copy udata for alloc_ucontext\n");
1820 		goto err_out;
1821 	}
1822 
1823 	err = efa_user_comp_handshake(ibucontext, &cmd);
1824 	if (err)
1825 		goto err_out;
1826 
1827 	err = efa_com_alloc_uar(&dev->edev, &result);
1828 	if (err)
1829 		goto err_out;
1830 
1831 	ucontext->uarn = result.uarn;
1832 
1833 	resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE;
1834 	resp.cmds_supp_udata_mask |= EFA_USER_CMDS_SUPP_UDATA_CREATE_AH;
1835 	resp.sub_cqs_per_cq = dev->dev_attr.sub_cqs_per_cq;
1836 	resp.inline_buf_size = dev->dev_attr.inline_buf_size;
1837 	resp.max_llq_size = dev->dev_attr.max_llq_size;
1838 	resp.max_tx_batch = dev->dev_attr.max_tx_batch;
1839 	resp.min_sq_wr = dev->dev_attr.min_sq_depth;
1840 
1841 	err = ib_copy_to_udata(udata, &resp,
1842 			       min(sizeof(resp), udata->outlen));
1843 	if (err)
1844 		goto err_dealloc_uar;
1845 
1846 	return 0;
1847 
1848 err_dealloc_uar:
1849 	efa_dealloc_uar(dev, result.uarn);
1850 err_out:
1851 	atomic64_inc(&dev->stats.alloc_ucontext_err);
1852 	return err;
1853 }
1854 
1855 void efa_dealloc_ucontext(struct ib_ucontext *ibucontext)
1856 {
1857 	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
1858 	struct efa_dev *dev = to_edev(ibucontext->device);
1859 
1860 	efa_dealloc_uar(dev, ucontext->uarn);
1861 }
1862 
1863 void efa_mmap_free(struct rdma_user_mmap_entry *rdma_entry)
1864 {
1865 	struct efa_user_mmap_entry *entry = to_emmap(rdma_entry);
1866 
1867 	kfree(entry);
1868 }
1869 
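/*
 * Map a single rdma_user_mmap entry into the caller's VMA. The entry's
 * mmap_flag selects the mapping type: non-cached or write-combined I/O
 * (through rdma_user_mmap_io()) or kernel DMA pages, which are inserted one
 * page at a time with vm_insert_page().
 */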
1870 static int __efa_mmap(struct efa_dev *dev, struct efa_ucontext *ucontext,
1871 		      struct vm_area_struct *vma)
1872 {
1873 	struct rdma_user_mmap_entry *rdma_entry;
1874 	struct efa_user_mmap_entry *entry;
1875 	unsigned long va;
1876 	int err = 0;
1877 	u64 pfn;
1878 
1879 	rdma_entry = rdma_user_mmap_entry_get(&ucontext->ibucontext, vma);
1880 	if (!rdma_entry) {
1881 		ibdev_dbg(&dev->ibdev,
1882 			  "pgoff[%#lx] does not have a valid entry\n",
1883 			  vma->vm_pgoff);
1884 		atomic64_inc(&dev->stats.mmap_err);
1885 		return -EINVAL;
1886 	}
1887 	entry = to_emmap(rdma_entry);
1888 
1889 	ibdev_dbg(&dev->ibdev,
1890 		  "Mapping address[%#llx], length[%#zx], mmap_flag[%d]\n",
1891 		  entry->address, rdma_entry->npages * PAGE_SIZE,
1892 		  entry->mmap_flag);
1893 
1894 	pfn = entry->address >> PAGE_SHIFT;
1895 	switch (entry->mmap_flag) {
1896 	case EFA_MMAP_IO_NC:
1897 		err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn,
1898 					entry->rdma_entry.npages * PAGE_SIZE,
1899 					pgprot_noncached(vma->vm_page_prot),
1900 					rdma_entry);
1901 		break;
1902 	case EFA_MMAP_IO_WC:
1903 		err = rdma_user_mmap_io(&ucontext->ibucontext, vma, pfn,
1904 					entry->rdma_entry.npages * PAGE_SIZE,
1905 					pgprot_writecombine(vma->vm_page_prot),
1906 					rdma_entry);
1907 		break;
1908 	case EFA_MMAP_DMA_PAGE:
1909 		for (va = vma->vm_start; va < vma->vm_end;
1910 		     va += PAGE_SIZE, pfn++) {
1911 			err = vm_insert_page(vma, va, pfn_to_page(pfn));
1912 			if (err)
1913 				break;
1914 		}
1915 		break;
1916 	default:
1917 		err = -EINVAL;
1918 	}
1919 
1920 	if (err) {
1921 		ibdev_dbg(
1922 			&dev->ibdev,
1923 			"Couldn't mmap address[%#llx] length[%#zx] mmap_flag[%d] err[%d]\n",
1924 			entry->address, rdma_entry->npages * PAGE_SIZE,
1925 			entry->mmap_flag, err);
1926 		atomic64_inc(&dev->stats.mmap_err);
1927 	}
1928 
1929 	rdma_user_mmap_entry_put(rdma_entry);
1930 	return err;
1931 }
1932 
1933 int efa_mmap(struct ib_ucontext *ibucontext,
1934 	     struct vm_area_struct *vma)
1935 {
1936 	struct efa_ucontext *ucontext = to_eucontext(ibucontext);
1937 	struct efa_dev *dev = to_edev(ibucontext->device);
1938 	size_t length = vma->vm_end - vma->vm_start;
1939 
1940 	ibdev_dbg(&dev->ibdev,
1941 		  "start %#lx, end %#lx, length = %#zx, pgoff = %#lx\n",
1942 		  vma->vm_start, vma->vm_end, length, vma->vm_pgoff);
1943 
1944 	return __efa_mmap(dev, ucontext, vma);
1945 }
1946 
1947 static int efa_ah_destroy(struct efa_dev *dev, struct efa_ah *ah)
1948 {
1949 	struct efa_com_destroy_ah_params params = {
1950 		.ah = ah->ah,
1951 		.pdn = to_epd(ah->ibah.pd)->pdn,
1952 	};
1953 
1954 	return efa_com_destroy_ah(&dev->edev, &params);
1955 }
1956 
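/*
 * Create an address handle for the destination GID. AH creation goes through
 * the device command interface, so it is rejected with -EOPNOTSUPP when the
 * caller cannot sleep (RDMA_CREATE_AH_SLEEPABLE not set). The device-assigned
 * AH number is returned to userspace through udata when a response buffer is
 * provided.
 */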
1957 int efa_create_ah(struct ib_ah *ibah,
1958 		  struct rdma_ah_init_attr *init_attr,
1959 		  struct ib_udata *udata)
1960 {
1961 	struct rdma_ah_attr *ah_attr = init_attr->ah_attr;
1962 	struct efa_dev *dev = to_edev(ibah->device);
1963 	struct efa_com_create_ah_params params = {};
1964 	struct efa_ibv_create_ah_resp resp = {};
1965 	struct efa_com_create_ah_result result;
1966 	struct efa_ah *ah = to_eah(ibah);
1967 	int err;
1968 
1969 	if (!(init_attr->flags & RDMA_CREATE_AH_SLEEPABLE)) {
1970 		ibdev_dbg(&dev->ibdev,
1971 			  "Creating an address handle is not supported in atomic context\n");
1972 		err = -EOPNOTSUPP;
1973 		goto err_out;
1974 	}
1975 
1976 	if (udata->inlen &&
1977 	    !ib_is_udata_cleared(udata, 0, udata->inlen)) {
1978 		ibdev_dbg(&dev->ibdev, "Incompatible ABI params\n");
1979 		err = -EINVAL;
1980 		goto err_out;
1981 	}
1982 
1983 	memcpy(params.dest_addr, ah_attr->grh.dgid.raw,
1984 	       sizeof(params.dest_addr));
1985 	params.pdn = to_epd(ibah->pd)->pdn;
1986 	err = efa_com_create_ah(&dev->edev, &params, &result);
1987 	if (err)
1988 		goto err_out;
1989 
1990 	memcpy(ah->id, ah_attr->grh.dgid.raw, sizeof(ah->id));
1991 	ah->ah = result.ah;
1992 
1993 	resp.efa_address_handle = result.ah;
1994 
1995 	if (udata->outlen) {
1996 		err = ib_copy_to_udata(udata, &resp,
1997 				       min(sizeof(resp), udata->outlen));
1998 		if (err) {
1999 			ibdev_dbg(&dev->ibdev,
2000 				  "Failed to copy udata for create_ah response\n");
2001 			goto err_destroy_ah;
2002 		}
2003 	}
2004 	ibdev_dbg(&dev->ibdev, "Created ah[%d]\n", ah->ah);
2005 
2006 	return 0;
2007 
2008 err_destroy_ah:
2009 	efa_ah_destroy(dev, ah);
2010 err_out:
2011 	atomic64_inc(&dev->stats.create_ah_err);
2012 	return err;
2013 }
2014 
2015 int efa_destroy_ah(struct ib_ah *ibah, u32 flags)
2016 {
2017 	struct efa_dev *dev = to_edev(ibah->pd->device);
2018 	struct efa_ah *ah = to_eah(ibah);
2019 
2020 	ibdev_dbg(&dev->ibdev, "Destroy ah[%d]\n", ah->ah);
2021 
2022 	if (!(flags & RDMA_DESTROY_AH_SLEEPABLE)) {
2023 		ibdev_dbg(&dev->ibdev,
2024 			  "Destroying an address handle is not supported in atomic context\n");
2025 		return -EOPNOTSUPP;
2026 	}
2027 
2028 	efa_ah_destroy(dev, ah);
2029 	return 0;
2030 }
2031 
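/*
 * The two allocators below register the per-port and per-device counter
 * descriptor tables with the RDMA core, both using the default lifespan.
 */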
2032 struct rdma_hw_stats *efa_alloc_hw_port_stats(struct ib_device *ibdev,
2033 					      u32 port_num)
2034 {
2035 	return rdma_alloc_hw_stats_struct(efa_port_stats_descs,
2036 					  ARRAY_SIZE(efa_port_stats_descs),
2037 					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
2038 }
2039 
2040 struct rdma_hw_stats *efa_alloc_hw_device_stats(struct ib_device *ibdev)
2041 {
2042 	return rdma_alloc_hw_stats_struct(efa_device_stats_descs,
2043 					  ARRAY_SIZE(efa_device_stats_descs),
2044 					  RDMA_HW_STATS_DEFAULT_LIFESPAN);
2045 }
2046 
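/*
 * Device-wide statistics are kept in software: admin queue counters plus the
 * driver's own error counters. No device command is needed to read them.
 */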
2047 static int efa_fill_device_stats(struct efa_dev *dev,
2048 				 struct rdma_hw_stats *stats)
2049 {
2050 	struct efa_com_stats_admin *as = &dev->edev.aq.stats;
2051 	struct efa_stats *s = &dev->stats;
2052 
2053 	stats->value[EFA_SUBMITTED_CMDS] = atomic64_read(&as->submitted_cmd);
2054 	stats->value[EFA_COMPLETED_CMDS] = atomic64_read(&as->completed_cmd);
2055 	stats->value[EFA_CMDS_ERR] = atomic64_read(&as->cmd_err);
2056 	stats->value[EFA_NO_COMPLETION_CMDS] = atomic64_read(&as->no_completion);
2057 
2058 	stats->value[EFA_KEEP_ALIVE_RCVD] = atomic64_read(&s->keep_alive_rcvd);
2059 	stats->value[EFA_ALLOC_PD_ERR] = atomic64_read(&s->alloc_pd_err);
2060 	stats->value[EFA_CREATE_QP_ERR] = atomic64_read(&s->create_qp_err);
2061 	stats->value[EFA_CREATE_CQ_ERR] = atomic64_read(&s->create_cq_err);
2062 	stats->value[EFA_REG_MR_ERR] = atomic64_read(&s->reg_mr_err);
2063 	stats->value[EFA_ALLOC_UCONTEXT_ERR] =
2064 		atomic64_read(&s->alloc_ucontext_err);
2065 	stats->value[EFA_CREATE_AH_ERR] = atomic64_read(&s->create_ah_err);
2066 	stats->value[EFA_MMAP_ERR] = atomic64_read(&s->mmap_err);
2067 
2068 	return ARRAY_SIZE(efa_device_stats_descs);
2069 }
2070 
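/*
 * Port statistics are fetched from the device with three get-stats admin
 * commands: basic (bytes/packets/drops), messages (send/recv bytes and work
 * requests) and RDMA read counters.
 */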
2071 static int efa_fill_port_stats(struct efa_dev *dev, struct rdma_hw_stats *stats,
2072 			       u32 port_num)
2073 {
2074 	struct efa_com_get_stats_params params = {};
2075 	union efa_com_get_stats_result result;
2076 	struct efa_com_rdma_read_stats *rrs;
2077 	struct efa_com_messages_stats *ms;
2078 	struct efa_com_basic_stats *bs;
2079 	int err;
2080 
2081 	params.scope = EFA_ADMIN_GET_STATS_SCOPE_ALL;
2082 	params.type = EFA_ADMIN_GET_STATS_TYPE_BASIC;
2083 
2084 	err = efa_com_get_stats(&dev->edev, &params, &result);
2085 	if (err)
2086 		return err;
2087 
2088 	bs = &result.basic_stats;
2089 	stats->value[EFA_TX_BYTES] = bs->tx_bytes;
2090 	stats->value[EFA_TX_PKTS] = bs->tx_pkts;
2091 	stats->value[EFA_RX_BYTES] = bs->rx_bytes;
2092 	stats->value[EFA_RX_PKTS] = bs->rx_pkts;
2093 	stats->value[EFA_RX_DROPS] = bs->rx_drops;
2094 
2095 	params.type = EFA_ADMIN_GET_STATS_TYPE_MESSAGES;
2096 	err = efa_com_get_stats(&dev->edev, &params, &result);
2097 	if (err)
2098 		return err;
2099 
2100 	ms = &result.messages_stats;
2101 	stats->value[EFA_SEND_BYTES] = ms->send_bytes;
2102 	stats->value[EFA_SEND_WRS] = ms->send_wrs;
2103 	stats->value[EFA_RECV_BYTES] = ms->recv_bytes;
2104 	stats->value[EFA_RECV_WRS] = ms->recv_wrs;
2105 
2106 	params.type = EFA_ADMIN_GET_STATS_TYPE_RDMA_READ;
2107 	err = efa_com_get_stats(&dev->edev, &params, &result);
2108 	if (err)
2109 		return err;
2110 
2111 	rrs = &result.rdma_read_stats;
2112 	stats->value[EFA_RDMA_READ_WRS] = rrs->read_wrs;
2113 	stats->value[EFA_RDMA_READ_BYTES] = rrs->read_bytes;
2114 	stats->value[EFA_RDMA_READ_WR_ERR] = rrs->read_wr_err;
2115 	stats->value[EFA_RDMA_READ_RESP_BYTES] = rrs->read_resp_bytes;
2116 
2117 	return ARRAY_SIZE(efa_port_stats_descs);
2118 }
2119 
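/*
 * rdma_hw_stats dispatch: a non-zero port_num selects the per-port counters,
 * port_num == 0 selects the device-wide counters.
 */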
2120 int efa_get_hw_stats(struct ib_device *ibdev, struct rdma_hw_stats *stats,
2121 		     u32 port_num, int index)
2122 {
2123 	if (port_num)
2124 		return efa_fill_port_stats(to_edev(ibdev), stats, port_num);
2125 	else
2126 		return efa_fill_device_stats(to_edev(ibdev), stats);
2127 }
2128 
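/*
 * EFA does not expose an InfiniBand or Ethernet link layer, so report it as
 * unspecified.
 */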
2129 enum rdma_link_layer efa_port_link_layer(struct ib_device *ibdev,
2130 					 u32 port_num)
2131 {
2132 	return IB_LINK_LAYER_UNSPECIFIED;
2133 }
2134 
2135