xref: /freebsd/sys/dev/mlx5/mlx5_ib/mlx5_ib_cq.c (revision 31ba4ce8898f9dfa5e7f054fdbc26e50a599a6e3)
1 /*-
2  * Copyright (c) 2013-2020, Mellanox Technologies.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include <linux/kref.h>
29 #include <rdma/ib_umem.h>
30 #include <rdma/ib_user_verbs.h>
31 #include <rdma/ib_cache.h>
32 #include <rdma/uverbs_ioctl.h>
33 #include "mlx5_ib.h"
34 
35 static void mlx5_ib_cq_comp(struct mlx5_core_cq *cq, struct mlx5_eqe *eqe __unused)
36 {
37 	struct ib_cq *ibcq = &to_mibcq(cq)->ibcq;
38 
39 	ibcq->comp_handler(ibcq, ibcq->cq_context);
40 }
41 
42 static void mlx5_ib_cq_event(struct mlx5_core_cq *mcq, int type)
43 {
44 	struct mlx5_ib_cq *cq = container_of(mcq, struct mlx5_ib_cq, mcq);
45 	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
46 	struct ib_cq *ibcq = &cq->ibcq;
47 	struct ib_event event;
48 
49 	if (type != MLX5_EVENT_TYPE_CQ_ERROR) {
50 		mlx5_ib_warn(dev, "Unexpected event type %d on CQ %06x\n",
51 			     type, mcq->cqn);
52 		return;
53 	}
54 
55 	if (ibcq->event_handler) {
56 		event.device     = &dev->ib_dev;
57 		event.event      = IB_EVENT_CQ_ERR;
58 		event.element.cq = ibcq;
59 		ibcq->event_handler(&event, ibcq->cq_context);
60 	}
61 }
62 
63 static void *get_cqe_from_buf(struct mlx5_ib_cq_buf *buf, int n, int size)
64 {
65 	return mlx5_buf_offset(&buf->buf, n * size);
66 }
67 
68 static void *get_cqe(struct mlx5_ib_cq *cq, int n)
69 {
70 	return get_cqe_from_buf(&cq->buf, n, cq->mcq.cqe_sz);
71 }
72 
73 static u8 sw_ownership_bit(int n, int nent)
74 {
75 	return (n & nent) ? 1 : 0;
76 }
77 
78 static void *get_sw_cqe(struct mlx5_ib_cq *cq, int n)
79 {
80 	void *cqe = get_cqe(cq, n & cq->ibcq.cqe);
81 	struct mlx5_cqe64 *cqe64;
82 
83 	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
84 
85 	if (likely((cqe64->op_own) >> 4 != MLX5_CQE_INVALID) &&
86 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibcq.cqe + 1)))) {
87 		return cqe;
88 	} else {
89 		return NULL;
90 	}
91 }
92 
93 static void *next_cqe_sw(struct mlx5_ib_cq *cq)
94 {
95 	return get_sw_cqe(cq, cq->mcq.cons_index);
96 }
97 
98 static enum ib_wc_opcode get_umr_comp(struct mlx5_ib_wq *wq, int idx)
99 {
100 	switch (wq->wr_data[idx]) {
101 	case MLX5_IB_WR_UMR:
102 		return 0;
103 
104 	case IB_WR_LOCAL_INV:
105 		return IB_WC_LOCAL_INV;
106 
107 	case IB_WR_REG_MR:
108 		return IB_WC_REG_MR;
109 
110 	default:
111 		pr_warn("unknown completion status\n");
112 		return 0;
113 	}
114 }
115 
116 static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
117 			    struct mlx5_ib_wq *wq, int idx)
118 {
119 	wc->wc_flags = 0;
120 	switch (be32_to_cpu(cqe->sop_drop_qpn) >> 24) {
121 	case MLX5_OPCODE_RDMA_WRITE_IMM:
122 		wc->wc_flags |= IB_WC_WITH_IMM;
123 	case MLX5_OPCODE_RDMA_WRITE:
124 		wc->opcode    = IB_WC_RDMA_WRITE;
125 		break;
126 	case MLX5_OPCODE_SEND_IMM:
127 		wc->wc_flags |= IB_WC_WITH_IMM;
128 	case MLX5_OPCODE_SEND:
129 	case MLX5_OPCODE_SEND_INVAL:
130 		wc->opcode    = IB_WC_SEND;
131 		break;
132 	case MLX5_OPCODE_RDMA_READ:
133 		wc->opcode    = IB_WC_RDMA_READ;
134 		wc->byte_len  = be32_to_cpu(cqe->byte_cnt);
135 		break;
136 	case MLX5_OPCODE_ATOMIC_CS:
137 		wc->opcode    = IB_WC_COMP_SWAP;
138 		wc->byte_len  = 8;
139 		break;
140 	case MLX5_OPCODE_ATOMIC_FA:
141 		wc->opcode    = IB_WC_FETCH_ADD;
142 		wc->byte_len  = 8;
143 		break;
144 	case MLX5_OPCODE_ATOMIC_MASKED_CS:
145 		wc->opcode    = IB_WC_MASKED_COMP_SWAP;
146 		wc->byte_len  = 8;
147 		break;
148 	case MLX5_OPCODE_ATOMIC_MASKED_FA:
149 		wc->opcode    = IB_WC_MASKED_FETCH_ADD;
150 		wc->byte_len  = 8;
151 		break;
152 	case MLX5_OPCODE_UMR:
153 		wc->opcode = get_umr_comp(wq, idx);
154 		break;
155 	}
156 }
157 
/*
 * GRH location codes.  NOTE(review): not referenced in the visible part of
 * this file; judging by the names they say whether the GRH is delivered in
 * the receive buffer or inside the CQE -- confirm against the users.
 */
enum {
	MLX5_GRH_IN_BUFFER	= 1,
	MLX5_GRH_IN_CQE		= 2,
};
162 
163 static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe,
164 			     struct mlx5_ib_qp *qp)
165 {
166 	enum rdma_link_layer ll = rdma_port_get_link_layer(qp->ibqp.device, 1);
167 	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
168 	struct mlx5_ib_srq *srq;
169 	struct mlx5_ib_wq *wq;
170 	u16 wqe_ctr;
171 	u8  roce_packet_type;
172 	bool vlan_present;
173 	u8 g;
174 
175 	if (qp->ibqp.srq || qp->ibqp.xrcd) {
176 		struct mlx5_core_srq *msrq = NULL;
177 
178 		if (qp->ibqp.xrcd) {
179 			msrq = mlx5_core_get_srq(dev->mdev,
180 						 be32_to_cpu(cqe->srqn));
181 			srq = to_mibsrq(msrq);
182 		} else {
183 			srq = to_msrq(qp->ibqp.srq);
184 		}
185 		if (srq) {
186 			wqe_ctr = be16_to_cpu(cqe->wqe_counter);
187 			wc->wr_id = srq->wrid[wqe_ctr];
188 			mlx5_ib_free_srq_wqe(srq, wqe_ctr);
189 			if (msrq && atomic_dec_and_test(&msrq->refcount))
190 				complete(&msrq->free);
191 		}
192 	} else {
193 		wq	  = &qp->rq;
194 		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
195 		++wq->tail;
196 	}
197 	wc->byte_len = be32_to_cpu(cqe->byte_cnt);
198 
199 	switch (cqe->op_own >> 4) {
200 	case MLX5_CQE_RESP_WR_IMM:
201 		wc->opcode	= IB_WC_RECV_RDMA_WITH_IMM;
202 		wc->wc_flags	= IB_WC_WITH_IMM;
203 		wc->ex.imm_data = cqe->imm_inval_pkey;
204 		break;
205 	case MLX5_CQE_RESP_SEND:
206 		wc->opcode   = IB_WC_RECV;
207 		wc->wc_flags = IB_WC_IP_CSUM_OK;
208 		if (unlikely(!((cqe->hds_ip_ext & CQE_L3_OK) &&
209 			       (cqe->hds_ip_ext & CQE_L4_OK))))
210 			wc->wc_flags = 0;
211 		break;
212 	case MLX5_CQE_RESP_SEND_IMM:
213 		wc->opcode	= IB_WC_RECV;
214 		wc->wc_flags	= IB_WC_WITH_IMM;
215 		wc->ex.imm_data = cqe->imm_inval_pkey;
216 		break;
217 	case MLX5_CQE_RESP_SEND_INV:
218 		wc->opcode	= IB_WC_RECV;
219 		wc->wc_flags	= IB_WC_WITH_INVALIDATE;
220 		wc->ex.invalidate_rkey = be32_to_cpu(cqe->imm_inval_pkey);
221 		break;
222 	}
223 	wc->src_qp	   = be32_to_cpu(cqe->flags_rqpn) & 0xffffff;
224 	wc->dlid_path_bits = cqe->ml_path;
225 	g = (be32_to_cpu(cqe->flags_rqpn) >> 28) & 3;
226 	wc->wc_flags |= g ? IB_WC_GRH : 0;
227 	if (unlikely(is_qp1(qp->ibqp.qp_type))) {
228 		u16 pkey = be32_to_cpu(cqe->imm_inval_pkey) & 0xffff;
229 
230 		ib_find_cached_pkey(&dev->ib_dev, qp->port, pkey,
231 				    &wc->pkey_index);
232 	} else {
233 		wc->pkey_index = 0;
234 	}
235 
236 	if (ll != IB_LINK_LAYER_ETHERNET) {
237 		wc->slid = be16_to_cpu(cqe->slid);
238 		wc->sl = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0xf;
239 		return;
240 	}
241 
242 	wc->slid = 0;
243 	vlan_present = cqe_has_vlan(cqe);
244 	roce_packet_type   = (be32_to_cpu(cqe->flags_rqpn) >> 24) & 0x3;
245 	if (vlan_present) {
246 		wc->vlan_id = (be16_to_cpu(cqe->vlan_info)) & 0xfff;
247 		wc->sl = (be16_to_cpu(cqe->vlan_info) >> 13) & 0x7;
248 		wc->wc_flags |= IB_WC_WITH_VLAN;
249 	} else {
250 		wc->sl = 0;
251 	}
252 
253 	switch (roce_packet_type) {
254 	case MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH:
255 		wc->network_hdr_type = RDMA_NETWORK_IB;
256 		break;
257 	case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6:
258 		wc->network_hdr_type = RDMA_NETWORK_IPV6;
259 		break;
260 	case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4:
261 		wc->network_hdr_type = RDMA_NETWORK_IPV4;
262 		break;
263 	}
264 	wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE;
265 }
266 
267 static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe)
268 {
269 	__be32 *p = (__be32 *)cqe;
270 	int i;
271 
272 	mlx5_ib_warn(dev, "dump error cqe\n");
273 	for (i = 0; i < sizeof(*cqe) / 16; i++, p += 4)
274 		pr_info("%08x %08x %08x %08x\n", be32_to_cpu(p[0]),
275 			be32_to_cpu(p[1]), be32_to_cpu(p[2]),
276 			be32_to_cpu(p[3]));
277 }
278 
279 static void mlx5_handle_error_cqe(struct mlx5_ib_dev *dev,
280 				  struct mlx5_err_cqe *cqe,
281 				  struct ib_wc *wc)
282 {
283 	int dump = 1;
284 
285 	switch (cqe->syndrome) {
286 	case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR:
287 		wc->status = IB_WC_LOC_LEN_ERR;
288 		break;
289 	case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR:
290 		wc->status = IB_WC_LOC_QP_OP_ERR;
291 		break;
292 	case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR:
293 		wc->status = IB_WC_LOC_PROT_ERR;
294 		break;
295 	case MLX5_CQE_SYNDROME_WR_FLUSH_ERR:
296 		dump = 0;
297 		wc->status = IB_WC_WR_FLUSH_ERR;
298 		break;
299 	case MLX5_CQE_SYNDROME_MW_BIND_ERR:
300 		wc->status = IB_WC_MW_BIND_ERR;
301 		break;
302 	case MLX5_CQE_SYNDROME_BAD_RESP_ERR:
303 		wc->status = IB_WC_BAD_RESP_ERR;
304 		break;
305 	case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR:
306 		wc->status = IB_WC_LOC_ACCESS_ERR;
307 		break;
308 	case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
309 		wc->status = IB_WC_REM_INV_REQ_ERR;
310 		break;
311 	case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR:
312 		wc->status = IB_WC_REM_ACCESS_ERR;
313 		break;
314 	case MLX5_CQE_SYNDROME_REMOTE_OP_ERR:
315 		wc->status = IB_WC_REM_OP_ERR;
316 		break;
317 	case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
318 		wc->status = IB_WC_RETRY_EXC_ERR;
319 		dump = 0;
320 		break;
321 	case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
322 		wc->status = IB_WC_RNR_RETRY_EXC_ERR;
323 		dump = 0;
324 		break;
325 	case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR:
326 		wc->status = IB_WC_REM_ABORT_ERR;
327 		break;
328 	default:
329 		wc->status = IB_WC_GENERAL_ERR;
330 		break;
331 	}
332 
333 	wc->vendor_err = cqe->vendor_err_synd;
334 	if (dump)
335 		dump_cqe(dev, cqe);
336 }
337 
/*
 * Placeholder: always returns 0, so no completion is currently treated as
 * an atomic response.  Both arguments are ignored.
 */
static int
is_atomic_response(struct mlx5_ib_qp *qp, uint16_t idx)
{
	/* TBD: waiting decision */
	(void)qp;
	(void)idx;

	return 0;
}
344 
345 static void *mlx5_get_atomic_laddr(struct mlx5_ib_qp *qp, uint16_t idx)
346 {
347 	struct mlx5_wqe_data_seg *dpseg;
348 	void *addr;
349 
350 	dpseg = mlx5_get_send_wqe(qp, idx) + sizeof(struct mlx5_wqe_ctrl_seg) +
351 		sizeof(struct mlx5_wqe_raddr_seg) +
352 		sizeof(struct mlx5_wqe_atomic_seg);
353 	addr = (void *)(unsigned long)be64_to_cpu(dpseg->addr);
354 	return addr;
355 }
356 
357 static void handle_atomic(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
358 			  uint16_t idx)
359 {
360 	void *addr;
361 	int byte_count;
362 	int i;
363 
364 	if (!is_atomic_response(qp, idx))
365 		return;
366 
367 	byte_count = be32_to_cpu(cqe64->byte_cnt);
368 	addr = mlx5_get_atomic_laddr(qp, idx);
369 
370 	if (byte_count == 4) {
371 		*(uint32_t *)addr = be32_to_cpu(*((__be32 *)addr));
372 	} else {
373 		for (i = 0; i < byte_count; i += 8) {
374 			*(uint64_t *)addr = be64_to_cpu(*((__be64 *)addr));
375 			addr += 8;
376 		}
377 	}
378 
379 	return;
380 }
381 
382 static void handle_atomics(struct mlx5_ib_qp *qp, struct mlx5_cqe64 *cqe64,
383 			   u16 tail, u16 head)
384 {
385 	u16 idx;
386 
387 	do {
388 		idx = tail & (qp->sq.wqe_cnt - 1);
389 		handle_atomic(qp, cqe64, idx);
390 		if (idx == head)
391 			break;
392 
393 		tail = qp->sq.w_list[idx].next;
394 	} while (1);
395 	tail = qp->sq.w_list[idx].next;
396 	qp->sq.last_poll = tail;
397 }
398 
399 static void free_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf)
400 {
401 	mlx5_buf_free(dev->mdev, &buf->buf);
402 }
403 
404 static void get_sig_err_item(struct mlx5_sig_err_cqe *cqe,
405 			     struct ib_sig_err *item)
406 {
407 	u16 syndrome = be16_to_cpu(cqe->syndrome);
408 
409 #define GUARD_ERR   (1 << 13)
410 #define APPTAG_ERR  (1 << 12)
411 #define REFTAG_ERR  (1 << 11)
412 
413 	if (syndrome & GUARD_ERR) {
414 		item->err_type = IB_SIG_BAD_GUARD;
415 		item->expected = be32_to_cpu(cqe->expected_trans_sig) >> 16;
416 		item->actual = be32_to_cpu(cqe->actual_trans_sig) >> 16;
417 	} else
418 	if (syndrome & REFTAG_ERR) {
419 		item->err_type = IB_SIG_BAD_REFTAG;
420 		item->expected = be32_to_cpu(cqe->expected_reftag);
421 		item->actual = be32_to_cpu(cqe->actual_reftag);
422 	} else
423 	if (syndrome & APPTAG_ERR) {
424 		item->err_type = IB_SIG_BAD_APPTAG;
425 		item->expected = be32_to_cpu(cqe->expected_trans_sig) & 0xffff;
426 		item->actual = be32_to_cpu(cqe->actual_trans_sig) & 0xffff;
427 	} else {
428 		pr_err("Got signature completion error with bad syndrome %04x\n",
429 		       syndrome);
430 	}
431 
432 	item->sig_err_offset = be64_to_cpu(cqe->err_offset);
433 	item->key = be32_to_cpu(cqe->mkey);
434 }
435 
436 static void sw_send_comp(struct mlx5_ib_qp *qp, int num_entries,
437 			 struct ib_wc *wc, int *npolled)
438 {
439 	struct mlx5_ib_wq *wq;
440 	unsigned int cur;
441 	unsigned int idx;
442 	int np;
443 	int i;
444 
445 	wq = &qp->sq;
446 	cur = wq->head - wq->tail;
447 	np = *npolled;
448 
449 	if (cur == 0)
450 		return;
451 
452 	for (i = 0;  i < cur && np < num_entries; i++) {
453 		idx = wq->last_poll & (wq->wqe_cnt - 1);
454 		wc->wr_id = wq->wrid[idx];
455 		wc->status = IB_WC_WR_FLUSH_ERR;
456 		wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
457 		wq->tail++;
458 		np++;
459 		wc->qp = &qp->ibqp;
460 		wc++;
461 		wq->last_poll = wq->w_list[idx].next;
462 	}
463 	*npolled = np;
464 }
465 
466 static void sw_recv_comp(struct mlx5_ib_qp *qp, int num_entries,
467 			 struct ib_wc *wc, int *npolled)
468 {
469 	struct mlx5_ib_wq *wq;
470 	unsigned int cur;
471 	int np;
472 	int i;
473 
474 	wq = &qp->rq;
475 	cur = wq->head - wq->tail;
476 	np = *npolled;
477 
478 	if (cur == 0)
479 		return;
480 
481 	for (i = 0;  i < cur && np < num_entries; i++) {
482 		wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
483 		wc->status = IB_WC_WR_FLUSH_ERR;
484 		wc->vendor_err = MLX5_CQE_SYNDROME_WR_FLUSH_ERR;
485 		wq->tail++;
486 		np++;
487 		wc->qp = &qp->ibqp;
488 		wc++;
489 	}
490 	*npolled = np;
491 }
492 
493 static void mlx5_ib_poll_sw_comp(struct mlx5_ib_cq *cq, int num_entries,
494 				 struct ib_wc *wc, int *npolled)
495 {
496 	struct mlx5_ib_qp *qp;
497 
498 	*npolled = 0;
499 	/* Find uncompleted WQEs belonging to that cq and retrun mmics ones */
500 	list_for_each_entry(qp, &cq->list_send_qp, cq_send_list) {
501 		sw_send_comp(qp, num_entries, wc + *npolled, npolled);
502 		if (*npolled >= num_entries)
503 			return;
504 	}
505 
506 	list_for_each_entry(qp, &cq->list_recv_qp, cq_recv_list) {
507 		sw_recv_comp(qp, num_entries, wc + *npolled, npolled);
508 		if (*npolled >= num_entries)
509 			return;
510 	}
511 }
512 
513 static int mlx5_poll_one(struct mlx5_ib_cq *cq,
514 			 struct mlx5_ib_qp **cur_qp,
515 			 struct ib_wc *wc)
516 {
517 	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
518 	struct mlx5_err_cqe *err_cqe;
519 	struct mlx5_cqe64 *cqe64;
520 	struct mlx5_core_qp *mqp;
521 	struct mlx5_ib_wq *wq;
522 	struct mlx5_sig_err_cqe *sig_err_cqe;
523 	struct mlx5_core_mkey *mmkey;
524 	struct mlx5_ib_mr *mr;
525 	unsigned long flags;
526 	uint8_t opcode;
527 	uint32_t qpn;
528 	u16 wqe_ctr;
529 	void *cqe;
530 	int idx;
531 
532 repoll:
533 	cqe = next_cqe_sw(cq);
534 	if (!cqe)
535 		return -EAGAIN;
536 
537 	cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
538 
539 	++cq->mcq.cons_index;
540 
541 	/* Make sure we read CQ entry contents after we've checked the
542 	 * ownership bit.
543 	 */
544 	rmb();
545 
546 	opcode = cqe64->op_own >> 4;
547 	if (unlikely(opcode == MLX5_CQE_RESIZE_CQ)) {
548 		if (likely(cq->resize_buf)) {
549 			free_cq_buf(dev, &cq->buf);
550 			cq->buf = *cq->resize_buf;
551 			kfree(cq->resize_buf);
552 			cq->resize_buf = NULL;
553 			goto repoll;
554 		} else {
555 			mlx5_ib_warn(dev, "unexpected resize cqe\n");
556 		}
557 	}
558 
559 	qpn = ntohl(cqe64->sop_drop_qpn) & 0xffffff;
560 	if (!*cur_qp || (qpn != (*cur_qp)->ibqp.qp_num)) {
561 		/* We do not have to take the QP table lock here,
562 		 * because CQs will be locked while QPs are removed
563 		 * from the table.
564 		 */
565 		mqp = __mlx5_qp_lookup(dev->mdev, qpn);
566 		*cur_qp = to_mibqp(mqp);
567 	}
568 
569 	wc->qp  = &(*cur_qp)->ibqp;
570 	switch (opcode) {
571 	case MLX5_CQE_REQ:
572 		wq = &(*cur_qp)->sq;
573 		wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
574 		idx = wqe_ctr & (wq->wqe_cnt - 1);
575 		handle_good_req(wc, cqe64, wq, idx);
576 		handle_atomics(*cur_qp, cqe64, wq->last_poll, idx);
577 		wc->wr_id = wq->wrid[idx];
578 		wq->tail = wq->wqe_head[idx] + 1;
579 		wc->status = IB_WC_SUCCESS;
580 		break;
581 	case MLX5_CQE_RESP_WR_IMM:
582 	case MLX5_CQE_RESP_SEND:
583 	case MLX5_CQE_RESP_SEND_IMM:
584 	case MLX5_CQE_RESP_SEND_INV:
585 		handle_responder(wc, cqe64, *cur_qp);
586 		wc->status = IB_WC_SUCCESS;
587 		break;
588 	case MLX5_CQE_RESIZE_CQ:
589 		break;
590 	case MLX5_CQE_REQ_ERR:
591 	case MLX5_CQE_RESP_ERR:
592 		err_cqe = (struct mlx5_err_cqe *)cqe64;
593 		mlx5_handle_error_cqe(dev, err_cqe, wc);
594 		mlx5_ib_dbg(dev, "%s error cqe on cqn 0x%x:\n",
595 			    opcode == MLX5_CQE_REQ_ERR ?
596 			    "Requestor" : "Responder", cq->mcq.cqn);
597 		mlx5_ib_dbg(dev, "syndrome 0x%x, vendor syndrome 0x%x\n",
598 			    err_cqe->syndrome, err_cqe->vendor_err_synd);
599 		if (opcode == MLX5_CQE_REQ_ERR) {
600 			wq = &(*cur_qp)->sq;
601 			wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
602 			idx = wqe_ctr & (wq->wqe_cnt - 1);
603 			wc->wr_id = wq->wrid[idx];
604 			wq->tail = wq->wqe_head[idx] + 1;
605 		} else {
606 			struct mlx5_ib_srq *srq;
607 
608 			if ((*cur_qp)->ibqp.srq) {
609 				srq = to_msrq((*cur_qp)->ibqp.srq);
610 				wqe_ctr = be16_to_cpu(cqe64->wqe_counter);
611 				wc->wr_id = srq->wrid[wqe_ctr];
612 				mlx5_ib_free_srq_wqe(srq, wqe_ctr);
613 			} else {
614 				wq = &(*cur_qp)->rq;
615 				wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
616 				++wq->tail;
617 			}
618 		}
619 		break;
620 	case MLX5_CQE_SIG_ERR:
621 		sig_err_cqe = (struct mlx5_sig_err_cqe *)cqe64;
622 
623 		spin_lock_irqsave(&dev->mdev->priv.mr_table.lock, flags);
624 		mmkey = __mlx5_mr_lookup(dev->mdev,
625 					 mlx5_base_mkey(be32_to_cpu(sig_err_cqe->mkey)));
626 		mr = to_mibmr(mmkey);
627 		get_sig_err_item(sig_err_cqe, &mr->sig->err_item);
628 		mr->sig->sig_err_exists = true;
629 		mr->sig->sigerr_count++;
630 
631 		mlx5_ib_warn(dev, "CQN: 0x%x Got SIGERR on key: 0x%x err_type %x err_offset %llx expected %x actual %x\n",
632 			     cq->mcq.cqn, mr->sig->err_item.key,
633 			     mr->sig->err_item.err_type,
634 			     (long long)mr->sig->err_item.sig_err_offset,
635 			     mr->sig->err_item.expected,
636 			     mr->sig->err_item.actual);
637 
638 		spin_unlock_irqrestore(&dev->mdev->priv.mr_table.lock, flags);
639 		goto repoll;
640 	}
641 
642 	return 0;
643 }
644 
645 static int poll_soft_wc(struct mlx5_ib_cq *cq, int num_entries,
646 			struct ib_wc *wc)
647 {
648 	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
649 	struct mlx5_ib_wc *soft_wc, *next;
650 	int npolled = 0;
651 
652 	list_for_each_entry_safe(soft_wc, next, &cq->wc_list, list) {
653 		if (npolled >= num_entries)
654 			break;
655 
656 		mlx5_ib_dbg(dev, "polled software generated completion on CQ 0x%x\n",
657 			    cq->mcq.cqn);
658 
659 		wc[npolled++] = soft_wc->wc;
660 		list_del(&soft_wc->list);
661 		kfree(soft_wc);
662 	}
663 
664 	return npolled;
665 }
666 
667 int mlx5_ib_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc)
668 {
669 	struct mlx5_ib_cq *cq = to_mcq(ibcq);
670 	struct mlx5_ib_qp *cur_qp = NULL;
671 	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
672 	struct mlx5_core_dev *mdev = dev->mdev;
673 	unsigned long flags;
674 	int soft_polled = 0;
675 	int npolled;
676 
677 	spin_lock_irqsave(&cq->lock, flags);
678 	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)) {
679 		mlx5_ib_poll_sw_comp(cq, num_entries, wc, &npolled);
680 		goto out;
681 	}
682 
683 	if (unlikely(!list_empty(&cq->wc_list)))
684 		soft_polled = poll_soft_wc(cq, num_entries, wc);
685 
686 	for (npolled = 0; npolled < num_entries - soft_polled; npolled++) {
687 		if (mlx5_poll_one(cq, &cur_qp, wc + soft_polled + npolled))
688 			break;
689 	}
690 
691 	if (npolled)
692 		mlx5_cq_set_ci(&cq->mcq);
693 out:
694 	spin_unlock_irqrestore(&cq->lock, flags);
695 
696 	return soft_polled + npolled;
697 }
698 
699 int mlx5_ib_arm_cq(struct ib_cq *ibcq, enum ib_cq_notify_flags flags)
700 {
701 	struct mlx5_core_dev *mdev = to_mdev(ibcq->device)->mdev;
702 	struct mlx5_ib_cq *cq = to_mcq(ibcq);
703 	void __iomem *uar_page = mdev->priv.uar->map;
704 	unsigned long irq_flags;
705 	int ret = 0;
706 
707 	if (unlikely(mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR))
708 		return -1;
709 
710 	spin_lock_irqsave(&cq->lock, irq_flags);
711 	if (cq->notify_flags != IB_CQ_NEXT_COMP)
712 		cq->notify_flags = flags & IB_CQ_SOLICITED_MASK;
713 
714 	if ((flags & IB_CQ_REPORT_MISSED_EVENTS) && !list_empty(&cq->wc_list))
715 		ret = 1;
716 	spin_unlock_irqrestore(&cq->lock, irq_flags);
717 
718 	mlx5_cq_arm(&cq->mcq,
719 		    (flags & IB_CQ_SOLICITED_MASK) == IB_CQ_SOLICITED ?
720 		    MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT,
721 		    uar_page,
722 		    MLX5_GET_DOORBELL_LOCK(&mdev->priv.cq_uar_lock),
723 		    cq->mcq.cons_index);
724 
725 	return ret;
726 }
727 
728 static int alloc_cq_buf(struct mlx5_ib_dev *dev, struct mlx5_ib_cq_buf *buf,
729 			int nent, int cqe_size)
730 {
731 	int err;
732 
733 	err = mlx5_buf_alloc(dev->mdev, nent * cqe_size,
734 	    2 * PAGE_SIZE, &buf->buf);
735 	if (err)
736 		return err;
737 
738 	buf->cqe_size = cqe_size;
739 	buf->nent = nent;
740 
741 	return 0;
742 }
743 
744 static int create_cq_user(struct mlx5_ib_dev *dev, struct ib_udata *udata,
745 			  struct mlx5_ib_cq *cq, int entries, u32 **cqb,
746 			  int *cqe_size, int *index, int *inlen)
747 {
748 	struct mlx5_ib_create_cq ucmd = {};
749 	size_t ucmdlen;
750 	int page_shift;
751 	__be64 *pas;
752 	int npages;
753 	int ncont;
754 	void *cqc;
755 	int err;
756 	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
757 		udata, struct mlx5_ib_ucontext, ibucontext);
758 
759 	ucmdlen = min(udata->inlen, sizeof(ucmd));
760 	if (ucmdlen < offsetof(struct mlx5_ib_create_cq, flags))
761 		return -EINVAL;
762 
763 	if (ib_copy_from_udata(&ucmd, udata, ucmdlen))
764 		return -EFAULT;
765 
766 	if ((ucmd.flags & ~(MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX)))
767 		return -EINVAL;
768 
769 	if (ucmd.cqe_size != 64 && ucmd.cqe_size != 128)
770 		return -EINVAL;
771 
772 	*cqe_size = ucmd.cqe_size;
773 
774 	cq->buf.umem = ib_umem_get(&context->ibucontext, ucmd.buf_addr,
775 				   entries * ucmd.cqe_size,
776 				   IB_ACCESS_LOCAL_WRITE, 1);
777 	if (IS_ERR(cq->buf.umem)) {
778 		err = PTR_ERR(cq->buf.umem);
779 		return err;
780 	}
781 
782 	err = mlx5_ib_db_map_user(context, ucmd.db_addr,
783 				  &cq->db);
784 	if (err)
785 		goto err_umem;
786 
787 	mlx5_ib_cont_pages(cq->buf.umem, ucmd.buf_addr, 0, &npages, &page_shift,
788 			   &ncont, NULL);
789 	mlx5_ib_dbg(dev, "addr 0x%llx, size %u, npages %d, page_shift %d, ncont %d\n",
790 		    (long long)ucmd.buf_addr, entries * ucmd.cqe_size, npages, page_shift, ncont);
791 
792 	*inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
793 		 MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * ncont;
794 	*cqb = mlx5_vzalloc(*inlen);
795 	if (!*cqb) {
796 		err = -ENOMEM;
797 		goto err_db;
798 	}
799 
800 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas);
801 	mlx5_ib_populate_pas(dev, cq->buf.umem, page_shift, pas, 0);
802 
803 	cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context);
804 	MLX5_SET(cqc, cqc, log_page_size,
805 		 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
806 
807 	if (ucmd.flags & MLX5_IB_CREATE_CQ_FLAGS_UAR_PAGE_INDEX) {
808 		*index = ucmd.uar_page_index;
809 	} else if (context->bfregi.lib_uar_dyn) {
810 		err = -EINVAL;
811 		goto err_cqb;
812 	} else {
813 		*index = context->bfregi.sys_pages[0];
814 	}
815 
816 	MLX5_SET(create_cq_in, *cqb, uid, context->devx_uid);
817 	return 0;
818 
819 err_cqb:
820 	kvfree(*cqb);
821 
822 err_db:
823 	mlx5_ib_db_unmap_user(context, &cq->db);
824 
825 err_umem:
826 	ib_umem_release(cq->buf.umem);
827 	return err;
828 }
829 
830 static void destroy_cq_user(struct mlx5_ib_cq *cq, struct ib_udata *udata)
831 {
832 	struct mlx5_ib_ucontext *context = rdma_udata_to_drv_context(
833 		udata, struct mlx5_ib_ucontext, ibucontext);
834 
835 	mlx5_ib_db_unmap_user(context, &cq->db);
836 	ib_umem_release(cq->buf.umem);
837 }
838 
839 static void init_cq_buf(struct mlx5_ib_cq *cq, struct mlx5_ib_cq_buf *buf)
840 {
841 	int i;
842 	void *cqe;
843 	struct mlx5_cqe64 *cqe64;
844 
845 	for (i = 0; i < buf->nent; i++) {
846 		cqe = get_cqe_from_buf(buf, i, buf->cqe_size);
847 		cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
848 		cqe64->op_own = MLX5_CQE_INVALID << 4;
849 	}
850 }
851 
852 static int create_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
853 			    int entries, int cqe_size,
854 			    u32 **cqb, int *index, int *inlen)
855 {
856 	__be64 *pas;
857 	void *cqc;
858 	int err;
859 
860 	err = mlx5_db_alloc(dev->mdev, &cq->db);
861 	if (err)
862 		return err;
863 
864 	cq->mcq.set_ci_db  = cq->db.db;
865 	cq->mcq.arm_db     = cq->db.db + 1;
866 	cq->mcq.cqe_sz = cqe_size;
867 
868 	err = alloc_cq_buf(dev, &cq->buf, entries, cqe_size);
869 	if (err)
870 		goto err_db;
871 
872 	init_cq_buf(cq, &cq->buf);
873 
874 	*inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
875 		 MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * cq->buf.buf.npages;
876 	*cqb = mlx5_vzalloc(*inlen);
877 	if (!*cqb) {
878 		err = -ENOMEM;
879 		goto err_buf;
880 	}
881 
882 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, *cqb, pas);
883 	mlx5_fill_page_array(&cq->buf.buf, pas);
884 
885 	cqc = MLX5_ADDR_OF(create_cq_in, *cqb, cq_context);
886 	MLX5_SET(cqc, cqc, log_page_size,
887 		 cq->buf.buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
888 
889 	*index = dev->mdev->priv.uar->index;
890 
891 	return 0;
892 
893 err_buf:
894 	free_cq_buf(dev, &cq->buf);
895 
896 err_db:
897 	mlx5_db_free(dev->mdev, &cq->db);
898 	return err;
899 }
900 
901 static void destroy_cq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq)
902 {
903 	free_cq_buf(dev, &cq->buf);
904 	mlx5_db_free(dev->mdev, &cq->db);
905 }
906 
907 static void notify_soft_wc_handler(struct work_struct *work)
908 {
909 	struct mlx5_ib_cq *cq = container_of(work, struct mlx5_ib_cq,
910 					     notify_work);
911 
912 	cq->ibcq.comp_handler(&cq->ibcq, cq->ibcq.cq_context);
913 }
914 
915 int mlx5_ib_create_cq(struct ib_cq *ibcq, const struct ib_cq_init_attr *attr,
916 		      struct ib_udata *udata)
917 {
918 	struct ib_device *ibdev = ibcq->device;
919 	int entries = attr->cqe;
920 	int vector = attr->comp_vector;
921 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
922 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
923 	struct mlx5_ib_cq *cq = to_mcq(ibcq);
924 	int uninitialized_var(index);
925 	int uninitialized_var(inlen);
926 	u32 *cqb = NULL;
927 	void *cqc;
928 	int cqe_size;
929 	unsigned int irqn;
930 	int eqn;
931 	int err;
932 
933 	if (entries < 0 ||
934 	    (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))))
935 		return -EINVAL;
936 
937 	if (check_cq_create_flags(attr->flags))
938 		return -EOPNOTSUPP;
939 
940 	entries = roundup_pow_of_two(entries + 1);
941 	if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)))
942 		return -EINVAL;
943 
944 	cq->ibcq.cqe = entries - 1;
945 	mutex_init(&cq->resize_mutex);
946 	spin_lock_init(&cq->lock);
947 	cq->resize_buf = NULL;
948 	cq->resize_umem = NULL;
949 	cq->create_flags = attr->flags;
950 	INIT_LIST_HEAD(&cq->list_send_qp);
951 	INIT_LIST_HEAD(&cq->list_recv_qp);
952 
953 	if (udata) {
954 		err = create_cq_user(dev, udata, cq, entries, &cqb, &cqe_size,
955 				     &index, &inlen);
956 		if (err)
957 			return err;
958 	} else {
959 		cqe_size = cache_line_size() == 128 ? 128 : 64;
960 		err = create_cq_kernel(dev, cq, entries, cqe_size, &cqb,
961 				       &index, &inlen);
962 		if (err)
963 			return err;
964 
965 		INIT_WORK(&cq->notify_work, notify_soft_wc_handler);
966 	}
967 
968 	err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn);
969 	if (err)
970 		goto err_cqb;
971 
972 	cq->cqe_size = cqe_size;
973 
974 	cqc = MLX5_ADDR_OF(create_cq_in, cqb, cq_context);
975 	MLX5_SET(cqc, cqc, cqe_sz, cqe_sz_to_mlx_sz(cqe_size));
976 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries));
977 	MLX5_SET(cqc, cqc, uar_page, index);
978 	MLX5_SET(cqc, cqc, c_eqn, eqn);
979 	MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
980 	if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN)
981 		MLX5_SET(cqc, cqc, oi, 1);
982 
983 	err = mlx5_core_create_cq(dev->mdev, &cq->mcq, cqb, inlen, out, sizeof(out));
984 	if (err)
985 		goto err_cqb;
986 
987 	mlx5_ib_dbg(dev, "cqn 0x%x\n", cq->mcq.cqn);
988 	cq->mcq.irqn = irqn;
989 	cq->mcq.comp  = mlx5_ib_cq_comp;
990 	cq->mcq.event = mlx5_ib_cq_event;
991 
992 	INIT_LIST_HEAD(&cq->wc_list);
993 
994 	if (udata)
995 		if (ib_copy_to_udata(udata, &cq->mcq.cqn, sizeof(__u32))) {
996 			err = -EFAULT;
997 			goto err_cmd;
998 		}
999 
1000 
1001 	kvfree(cqb);
1002 	return 0;
1003 
1004 err_cmd:
1005 	mlx5_core_destroy_cq(dev->mdev, &cq->mcq);
1006 
1007 err_cqb:
1008 	kvfree(cqb);
1009 	if (udata)
1010 		destroy_cq_user(cq, udata);
1011 	else
1012 		destroy_cq_kernel(dev, cq);
1013 	return err;
1014 }
1015 
1016 void mlx5_ib_destroy_cq(struct ib_cq *cq, struct ib_udata *udata)
1017 {
1018 	struct mlx5_ib_dev *dev = to_mdev(cq->device);
1019 	struct mlx5_ib_cq *mcq = to_mcq(cq);
1020 
1021 	mlx5_core_destroy_cq(dev->mdev, &mcq->mcq);
1022 	if (udata)
1023 		destroy_cq_user(mcq, udata);
1024 	else
1025 		destroy_cq_kernel(dev, mcq);
1026 }
1027 
1028 static int is_equal_rsn(struct mlx5_cqe64 *cqe64, u32 rsn)
1029 {
1030 	return rsn == (ntohl(cqe64->sop_drop_qpn) & 0xffffff);
1031 }
1032 
1033 void __mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 rsn, struct mlx5_ib_srq *srq)
1034 {
1035 	struct mlx5_cqe64 *cqe64, *dest64;
1036 	void *cqe, *dest;
1037 	u32 prod_index;
1038 	int nfreed = 0;
1039 	u8 owner_bit;
1040 
1041 	if (!cq)
1042 		return;
1043 
1044 	/* First we need to find the current producer index, so we
1045 	 * know where to start cleaning from.  It doesn't matter if HW
1046 	 * adds new entries after this loop -- the QP we're worried
1047 	 * about is already in RESET, so the new entries won't come
1048 	 * from our QP and therefore don't need to be checked.
1049 	 */
1050 	for (prod_index = cq->mcq.cons_index; get_sw_cqe(cq, prod_index); prod_index++)
1051 		if (prod_index == cq->mcq.cons_index + cq->ibcq.cqe)
1052 			break;
1053 
1054 	/* Now sweep backwards through the CQ, removing CQ entries
1055 	 * that match our QP by copying older entries on top of them.
1056 	 */
1057 	while ((int) --prod_index - (int) cq->mcq.cons_index >= 0) {
1058 		cqe = get_cqe(cq, prod_index & cq->ibcq.cqe);
1059 		cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;
1060 		if (is_equal_rsn(cqe64, rsn)) {
1061 			if (srq && (ntohl(cqe64->srqn) & 0xffffff))
1062 				mlx5_ib_free_srq_wqe(srq, be16_to_cpu(cqe64->wqe_counter));
1063 			++nfreed;
1064 		} else if (nfreed) {
1065 			dest = get_cqe(cq, (prod_index + nfreed) & cq->ibcq.cqe);
1066 			dest64 = (cq->mcq.cqe_sz == 64) ? dest : dest + 64;
1067 			owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK;
1068 			memcpy(dest, cqe, cq->mcq.cqe_sz);
1069 			dest64->op_own = owner_bit |
1070 				(dest64->op_own & ~MLX5_CQE_OWNER_MASK);
1071 		}
1072 	}
1073 
1074 	if (nfreed) {
1075 		cq->mcq.cons_index += nfreed;
1076 		/* Make sure update of buffer contents is done before
1077 		 * updating consumer index.
1078 		 */
1079 		wmb();
1080 		mlx5_cq_set_ci(&cq->mcq);
1081 	}
1082 }
1083 
1084 void mlx5_ib_cq_clean(struct mlx5_ib_cq *cq, u32 qpn, struct mlx5_ib_srq *srq)
1085 {
1086 	if (!cq)
1087 		return;
1088 
1089 	spin_lock_irq(&cq->lock);
1090 	__mlx5_ib_cq_clean(cq, qpn, srq);
1091 	spin_unlock_irq(&cq->lock);
1092 }
1093 
1094 int mlx5_ib_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
1095 {
1096 	struct mlx5_ib_dev *dev = to_mdev(cq->device);
1097 	struct mlx5_ib_cq *mcq = to_mcq(cq);
1098 	int err;
1099 
1100 	if (!MLX5_CAP_GEN(dev->mdev, cq_moderation))
1101 		return -ENOSYS;
1102 
1103 	err = mlx5_core_modify_cq_moderation(dev->mdev, &mcq->mcq,
1104 					     cq_period, cq_count);
1105 	if (err)
1106 		mlx5_ib_warn(dev, "modify cq 0x%x failed\n", mcq->mcq.cqn);
1107 
1108 	return err;
1109 }
1110 
1111 static int resize_user(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
1112 		       int entries, struct ib_udata *udata, int *npas,
1113 		       int *page_shift, int *cqe_size)
1114 {
1115 	struct mlx5_ib_resize_cq ucmd;
1116 	struct ib_umem *umem;
1117 	int err;
1118 	int npages;
1119 	struct ib_ucontext *context = cq->buf.umem->context;
1120 
1121 	err = ib_copy_from_udata(&ucmd, udata, sizeof(ucmd));
1122 	if (err)
1123 		return err;
1124 
1125 	if (ucmd.reserved0 || ucmd.reserved1)
1126 		return -EINVAL;
1127 
1128 	/* check multiplication overflow */
1129 	if (ucmd.cqe_size && SIZE_MAX / ucmd.cqe_size <= entries - 1)
1130 		return -EINVAL;
1131 
1132 	umem = ib_umem_get(context, ucmd.buf_addr,
1133 			   (size_t)ucmd.cqe_size * entries,
1134 			   IB_ACCESS_LOCAL_WRITE, 1);
1135 	if (IS_ERR(umem)) {
1136 		err = PTR_ERR(umem);
1137 		return err;
1138 	}
1139 
1140 	mlx5_ib_cont_pages(umem, ucmd.buf_addr, 0, &npages, page_shift,
1141 			   npas, NULL);
1142 
1143 	cq->resize_umem = umem;
1144 	*cqe_size = ucmd.cqe_size;
1145 
1146 	return 0;
1147 }
1148 
1149 static void un_resize_user(struct mlx5_ib_cq *cq)
1150 {
1151 	ib_umem_release(cq->resize_umem);
1152 }
1153 
1154 static int resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq,
1155 			 int entries, int cqe_size)
1156 {
1157 	int err;
1158 
1159 	cq->resize_buf = kzalloc(sizeof(*cq->resize_buf), GFP_KERNEL);
1160 	if (!cq->resize_buf)
1161 		return -ENOMEM;
1162 
1163 	err = alloc_cq_buf(dev, cq->resize_buf, entries, cqe_size);
1164 	if (err)
1165 		goto ex;
1166 
1167 	init_cq_buf(cq, cq->resize_buf);
1168 
1169 	return 0;
1170 
1171 ex:
1172 	kfree(cq->resize_buf);
1173 	return err;
1174 }
1175 
1176 static void un_resize_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_cq *cq)
1177 {
1178 	free_cq_buf(dev, cq->resize_buf);
1179 	cq->resize_buf = NULL;
1180 }
1181 
1182 static int copy_resize_cqes(struct mlx5_ib_cq *cq)
1183 {
1184 	struct mlx5_ib_dev *dev = to_mdev(cq->ibcq.device);
1185 	struct mlx5_cqe64 *scqe64;
1186 	struct mlx5_cqe64 *dcqe64;
1187 	void *start_cqe;
1188 	void *scqe;
1189 	void *dcqe;
1190 	int ssize;
1191 	int dsize;
1192 	int i;
1193 	u8 sw_own;
1194 
1195 	ssize = cq->buf.cqe_size;
1196 	dsize = cq->resize_buf->cqe_size;
1197 	if (ssize != dsize) {
1198 		mlx5_ib_warn(dev, "resize from different cqe size is not supported\n");
1199 		return -EINVAL;
1200 	}
1201 
1202 	i = cq->mcq.cons_index;
1203 	scqe = get_sw_cqe(cq, i);
1204 	scqe64 = ssize == 64 ? scqe : scqe + 64;
1205 	start_cqe = scqe;
1206 	if (!scqe) {
1207 		mlx5_ib_warn(dev, "expected cqe in sw ownership\n");
1208 		return -EINVAL;
1209 	}
1210 
1211 	while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) {
1212 		dcqe = get_cqe_from_buf(cq->resize_buf,
1213 					(i + 1) & (cq->resize_buf->nent),
1214 					dsize);
1215 		dcqe64 = dsize == 64 ? dcqe : dcqe + 64;
1216 		sw_own = sw_ownership_bit(i + 1, cq->resize_buf->nent);
1217 		memcpy(dcqe, scqe, dsize);
1218 		dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own;
1219 
1220 		++i;
1221 		scqe = get_sw_cqe(cq, i);
1222 		scqe64 = ssize == 64 ? scqe : scqe + 64;
1223 		if (!scqe) {
1224 			mlx5_ib_warn(dev, "expected cqe in sw ownership\n");
1225 			return -EINVAL;
1226 		}
1227 
1228 		if (scqe == start_cqe) {
1229 			pr_warn("resize CQ failed to get resize CQE, CQN 0x%x\n",
1230 				cq->mcq.cqn);
1231 			return -ENOMEM;
1232 		}
1233 	}
1234 	++cq->mcq.cons_index;
1235 	return 0;
1236 }
1237 
1238 int mlx5_ib_resize_cq(struct ib_cq *ibcq, int entries, struct ib_udata *udata)
1239 {
1240 	struct mlx5_ib_dev *dev = to_mdev(ibcq->device);
1241 	struct mlx5_ib_cq *cq = to_mcq(ibcq);
1242 	void *cqc;
1243 	u32 *in;
1244 	int err;
1245 	int npas;
1246 	__be64 *pas;
1247 	int page_shift;
1248 	int inlen;
1249 	int uninitialized_var(cqe_size);
1250 	unsigned long flags;
1251 
1252 	if (!MLX5_CAP_GEN(dev->mdev, cq_resize)) {
1253 		pr_info("Firmware does not support resize CQ\n");
1254 		return -ENOSYS;
1255 	}
1256 
1257 	if (entries < 1 ||
1258 	    entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) {
1259 		mlx5_ib_warn(dev, "wrong entries number %d, max %d\n",
1260 			     entries,
1261 			     1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz));
1262 		return -EINVAL;
1263 	}
1264 
1265 	entries = roundup_pow_of_two(entries + 1);
1266 	if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz)) + 1)
1267 		return -EINVAL;
1268 
1269 	if (entries == ibcq->cqe + 1)
1270 		return 0;
1271 
1272 	mutex_lock(&cq->resize_mutex);
1273 	if (udata) {
1274 		err = resize_user(dev, cq, entries, udata, &npas, &page_shift,
1275 				  &cqe_size);
1276 	} else {
1277 		cqe_size = 64;
1278 		err = resize_kernel(dev, cq, entries, cqe_size);
1279 		if (!err) {
1280 			npas = cq->resize_buf->buf.npages;
1281 			page_shift = cq->resize_buf->buf.page_shift;
1282 		}
1283 	}
1284 
1285 	if (err)
1286 		goto ex;
1287 
1288 	inlen = MLX5_ST_SZ_BYTES(modify_cq_in) +
1289 		MLX5_FLD_SZ_BYTES(modify_cq_in, pas[0]) * npas;
1290 
1291 	in = mlx5_vzalloc(inlen);
1292 	if (!in) {
1293 		err = -ENOMEM;
1294 		goto ex_resize;
1295 	}
1296 
1297 	pas = (__be64 *)MLX5_ADDR_OF(modify_cq_in, in, pas);
1298 	if (udata)
1299 		mlx5_ib_populate_pas(dev, cq->resize_umem, page_shift,
1300 				     pas, 0);
1301 	else
1302 		mlx5_fill_page_array(&cq->resize_buf->buf, pas);
1303 
1304 	MLX5_SET(modify_cq_in, in,
1305 		 modify_field_select_resize_field_select.resize_field_select.resize_field_select,
1306 		 MLX5_MODIFY_CQ_MASK_LOG_SIZE  |
1307 		 MLX5_MODIFY_CQ_MASK_PG_OFFSET |
1308 		 MLX5_MODIFY_CQ_MASK_PG_SIZE);
1309 
1310 	cqc = MLX5_ADDR_OF(modify_cq_in, in, cq_context);
1311 
1312 	MLX5_SET(cqc, cqc, log_page_size,
1313 		 page_shift - MLX5_ADAPTER_PAGE_SHIFT);
1314 	MLX5_SET(cqc, cqc, cqe_sz, cqe_sz_to_mlx_sz(cqe_size));
1315 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(entries));
1316 
1317 	MLX5_SET(modify_cq_in, in, op_mod, MLX5_CQ_OPMOD_RESIZE);
1318 	MLX5_SET(modify_cq_in, in, cqn, cq->mcq.cqn);
1319 
1320 	err = mlx5_core_modify_cq(dev->mdev, &cq->mcq, in, inlen);
1321 	if (err)
1322 		goto ex_alloc;
1323 
1324 	if (udata) {
1325 		cq->ibcq.cqe = entries - 1;
1326 		ib_umem_release(cq->buf.umem);
1327 		cq->buf.umem = cq->resize_umem;
1328 		cq->resize_umem = NULL;
1329 	} else {
1330 		struct mlx5_ib_cq_buf tbuf;
1331 		int resized = 0;
1332 
1333 		spin_lock_irqsave(&cq->lock, flags);
1334 		if (cq->resize_buf) {
1335 			err = copy_resize_cqes(cq);
1336 			if (!err) {
1337 				tbuf = cq->buf;
1338 				cq->buf = *cq->resize_buf;
1339 				kfree(cq->resize_buf);
1340 				cq->resize_buf = NULL;
1341 				resized = 1;
1342 			}
1343 		}
1344 		cq->ibcq.cqe = entries - 1;
1345 		spin_unlock_irqrestore(&cq->lock, flags);
1346 		if (resized)
1347 			free_cq_buf(dev, &tbuf);
1348 	}
1349 	mutex_unlock(&cq->resize_mutex);
1350 
1351 	kvfree(in);
1352 	return 0;
1353 
1354 ex_alloc:
1355 	kvfree(in);
1356 
1357 ex_resize:
1358 	if (udata)
1359 		un_resize_user(cq);
1360 	else
1361 		un_resize_kernel(dev, cq);
1362 ex:
1363 	mutex_unlock(&cq->resize_mutex);
1364 	return err;
1365 }
1366 
1367 int mlx5_ib_get_cqe_size(struct mlx5_ib_dev *dev, struct ib_cq *ibcq)
1368 {
1369 	struct mlx5_ib_cq *cq;
1370 
1371 	if (!ibcq)
1372 		return 128;
1373 
1374 	cq = to_mcq(ibcq);
1375 	return cq->cqe_size;
1376 }
1377 
1378 /* Called from atomic context */
1379 int mlx5_ib_generate_wc(struct ib_cq *ibcq, struct ib_wc *wc)
1380 {
1381 	struct mlx5_ib_wc *soft_wc;
1382 	struct mlx5_ib_cq *cq = to_mcq(ibcq);
1383 	unsigned long flags;
1384 
1385 	soft_wc = kmalloc(sizeof(*soft_wc), GFP_ATOMIC);
1386 	if (!soft_wc)
1387 		return -ENOMEM;
1388 
1389 	soft_wc->wc = *wc;
1390 	spin_lock_irqsave(&cq->lock, flags);
1391 	list_add_tail(&soft_wc->list, &cq->wc_list);
1392 	if (cq->notify_flags == IB_CQ_NEXT_COMP ||
1393 	    wc->status != IB_WC_SUCCESS) {
1394 		cq->notify_flags = 0;
1395 		schedule_work(&cq->notify_work);
1396 	}
1397 	spin_unlock_irqrestore(&cq->lock, flags);
1398 
1399 	return 0;
1400 }
1401