xref: /linux/drivers/infiniband/sw/siw/siw_qp.c (revision 95dbf14b236f3147f716cd159bd29461916c610e)
1 // SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
2 
3 /* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
4 /* Copyright (c) 2008-2019, IBM Corporation */
5 
6 #include <linux/errno.h>
7 #include <linux/types.h>
8 #include <linux/net.h>
9 #include <linux/scatterlist.h>
10 #include <linux/llist.h>
11 #include <asm/barrier.h>
12 #include <net/tcp.h>
13 
14 #include "siw.h"
15 #include "siw_verbs.h"
16 #include "siw_mem.h"
17 
18 static char siw_qp_state_to_string[SIW_QP_STATE_COUNT][sizeof "TERMINATE"] = {
19 	[SIW_QP_STATE_IDLE] = "IDLE",
20 	[SIW_QP_STATE_RTR] = "RTR",
21 	[SIW_QP_STATE_RTS] = "RTS",
22 	[SIW_QP_STATE_CLOSING] = "CLOSING",
23 	[SIW_QP_STATE_TERMINATE] = "TERMINATE",
24 	[SIW_QP_STATE_ERROR] = "ERROR"
25 };
26 
27 /*
28  * iWARP (RDMAP, DDP and MPA) parameters as well as Softiwarp settings on a
29  * per-RDMAP message basis. Please keep order of initializer. All MPA len
30  * is initialized to minimum packet size.
31  */
32 struct iwarp_msg_info iwarp_pktinfo[RDMAP_TERMINATE + 1] = {
33 	{ /* RDMAP_RDMA_WRITE */
34 	  .hdr_len = sizeof(struct iwarp_rdma_write),
35 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_write) - 2),
36 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
37 				 cpu_to_be16(DDP_VERSION << 8) |
38 				 cpu_to_be16(RDMAP_VERSION << 6) |
39 				 cpu_to_be16(RDMAP_RDMA_WRITE),
40 	  .rx_data = siw_proc_write },
41 	{ /* RDMAP_RDMA_READ_REQ */
42 	  .hdr_len = sizeof(struct iwarp_rdma_rreq),
43 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rreq) - 2),
44 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
45 				 cpu_to_be16(RDMAP_VERSION << 6) |
46 				 cpu_to_be16(RDMAP_RDMA_READ_REQ),
47 	  .rx_data = siw_proc_rreq },
48 	{ /* RDMAP_RDMA_READ_RESP */
49 	  .hdr_len = sizeof(struct iwarp_rdma_rresp),
50 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_rdma_rresp) - 2),
51 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_TAGGED | DDP_FLAG_LAST |
52 				 cpu_to_be16(DDP_VERSION << 8) |
53 				 cpu_to_be16(RDMAP_VERSION << 6) |
54 				 cpu_to_be16(RDMAP_RDMA_READ_RESP),
55 	  .rx_data = siw_proc_rresp },
56 	{ /* RDMAP_SEND */
57 	  .hdr_len = sizeof(struct iwarp_send),
58 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
59 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
60 				 cpu_to_be16(RDMAP_VERSION << 6) |
61 				 cpu_to_be16(RDMAP_SEND),
62 	  .rx_data = siw_proc_send },
63 	{ /* RDMAP_SEND_INVAL */
64 	  .hdr_len = sizeof(struct iwarp_send_inv),
65 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
66 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
67 				 cpu_to_be16(RDMAP_VERSION << 6) |
68 				 cpu_to_be16(RDMAP_SEND_INVAL),
69 	  .rx_data = siw_proc_send },
70 	{ /* RDMAP_SEND_SE */
71 	  .hdr_len = sizeof(struct iwarp_send),
72 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send) - 2),
73 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
74 				 cpu_to_be16(RDMAP_VERSION << 6) |
75 				 cpu_to_be16(RDMAP_SEND_SE),
76 	  .rx_data = siw_proc_send },
77 	{ /* RDMAP_SEND_SE_INVAL */
78 	  .hdr_len = sizeof(struct iwarp_send_inv),
79 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_send_inv) - 2),
80 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
81 				 cpu_to_be16(RDMAP_VERSION << 6) |
82 				 cpu_to_be16(RDMAP_SEND_SE_INVAL),
83 	  .rx_data = siw_proc_send },
84 	{ /* RDMAP_TERMINATE */
85 	  .hdr_len = sizeof(struct iwarp_terminate),
86 	  .ctrl.mpa_len = htons(sizeof(struct iwarp_terminate) - 2),
87 	  .ctrl.ddp_rdmap_ctrl = DDP_FLAG_LAST | cpu_to_be16(DDP_VERSION << 8) |
88 				 cpu_to_be16(RDMAP_VERSION << 6) |
89 				 cpu_to_be16(RDMAP_TERMINATE),
90 	  .rx_data = siw_proc_terminate }
91 };
92 
93 void siw_qp_llp_data_ready(struct sock *sk)
94 {
95 	struct siw_qp *qp;
96 
97 	read_lock(&sk->sk_callback_lock);
98 
99 	if (unlikely(!sk->sk_user_data || !sk_to_qp(sk)))
100 		goto done;
101 
102 	qp = sk_to_qp(sk);
103 
104 	if (likely(!qp->rx_stream.rx_suspend &&
105 		   down_read_trylock(&qp->state_lock))) {
106 		read_descriptor_t rd_desc = { .arg.data = qp, .count = 1 };
107 
108 		if (likely(qp->attrs.state == SIW_QP_STATE_RTS))
109 			/*
110 			 * Implements data receive operation during
111 			 * socket callback. TCP gracefully catches
112 			 * the case where there is nothing to receive
113 			 * (not calling siw_tcp_rx_data() then).
114 			 */
115 			tcp_read_sock(sk, &rd_desc, siw_tcp_rx_data);
116 
117 		up_read(&qp->state_lock);
118 	} else {
119 		siw_dbg_qp(qp, "unable to process RX, suspend: %d\n",
120 			   qp->rx_stream.rx_suspend);
121 	}
122 done:
123 	read_unlock(&sk->sk_callback_lock);
124 }
125 
126 void siw_qp_llp_close(struct siw_qp *qp)
127 {
128 	siw_dbg_qp(qp, "enter llp close, state = %s\n",
129 		   siw_qp_state_to_string[qp->attrs.state]);
130 
131 	down_write(&qp->state_lock);
132 
133 	qp->rx_stream.rx_suspend = 1;
134 	qp->tx_ctx.tx_suspend = 1;
135 	qp->attrs.sk = NULL;
136 
137 	switch (qp->attrs.state) {
138 	case SIW_QP_STATE_RTS:
139 	case SIW_QP_STATE_RTR:
140 	case SIW_QP_STATE_IDLE:
141 	case SIW_QP_STATE_TERMINATE:
142 		qp->attrs.state = SIW_QP_STATE_ERROR;
143 		break;
144 	/*
145 	 * SIW_QP_STATE_CLOSING:
146 	 *
147 	 * This is a forced close. shall the QP be moved to
148 	 * ERROR or IDLE ?
149 	 */
150 	case SIW_QP_STATE_CLOSING:
151 		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE)
152 			qp->attrs.state = SIW_QP_STATE_ERROR;
153 		else
154 			qp->attrs.state = SIW_QP_STATE_IDLE;
155 		break;
156 
157 	default:
158 		siw_dbg_qp(qp, "llp close: no state transition needed: %s\n",
159 			   siw_qp_state_to_string[qp->attrs.state]);
160 		break;
161 	}
162 	siw_sq_flush(qp);
163 	siw_rq_flush(qp);
164 
165 	/*
166 	 * Dereference closing CEP
167 	 */
168 	if (qp->cep) {
169 		siw_cep_put(qp->cep);
170 		qp->cep = NULL;
171 	}
172 
173 	up_write(&qp->state_lock);
174 
175 	siw_dbg_qp(qp, "llp close exit: state %s\n",
176 		   siw_qp_state_to_string[qp->attrs.state]);
177 }
178 
179 /*
180  * socket callback routine informing about newly available send space.
181  * Function schedules SQ work for processing SQ items.
182  */
183 void siw_qp_llp_write_space(struct sock *sk)
184 {
185 	struct siw_cep *cep;
186 
187 	read_lock(&sk->sk_callback_lock);
188 
189 	cep  = sk_to_cep(sk);
190 	if (cep) {
191 		cep->sk_write_space(sk);
192 
193 		if (!test_bit(SOCK_NOSPACE, &sk->sk_socket->flags))
194 			(void)siw_sq_start(cep->qp);
195 	}
196 
197 	read_unlock(&sk->sk_callback_lock);
198 }
199 
200 static int siw_qp_readq_init(struct siw_qp *qp, int irq_size, int orq_size)
201 {
202 	irq_size = roundup_pow_of_two(irq_size);
203 	orq_size = roundup_pow_of_two(orq_size);
204 
205 	qp->attrs.irq_size = irq_size;
206 	qp->attrs.orq_size = orq_size;
207 
208 	qp->irq = vzalloc(irq_size * sizeof(struct siw_sqe));
209 	if (!qp->irq) {
210 		siw_dbg_qp(qp, "irq malloc for %d failed\n", irq_size);
211 		qp->attrs.irq_size = 0;
212 		return -ENOMEM;
213 	}
214 	qp->orq = vzalloc(orq_size * sizeof(struct siw_sqe));
215 	if (!qp->orq) {
216 		siw_dbg_qp(qp, "orq malloc for %d failed\n", orq_size);
217 		qp->attrs.orq_size = 0;
218 		qp->attrs.irq_size = 0;
219 		vfree(qp->irq);
220 		return -ENOMEM;
221 	}
222 	siw_dbg_qp(qp, "ORD %d, IRD %d\n", orq_size, irq_size);
223 	return 0;
224 }
225 
226 static int siw_qp_enable_crc(struct siw_qp *qp)
227 {
228 	struct siw_rx_stream *c_rx = &qp->rx_stream;
229 	struct siw_iwarp_tx *c_tx = &qp->tx_ctx;
230 	int size;
231 
232 	if (siw_crypto_shash == NULL)
233 		return -ENOENT;
234 
235 	size = crypto_shash_descsize(siw_crypto_shash) +
236 		sizeof(struct shash_desc);
237 
238 	c_tx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
239 	c_rx->mpa_crc_hd = kzalloc(size, GFP_KERNEL);
240 	if (!c_tx->mpa_crc_hd || !c_rx->mpa_crc_hd) {
241 		kfree(c_tx->mpa_crc_hd);
242 		kfree(c_rx->mpa_crc_hd);
243 		c_tx->mpa_crc_hd = NULL;
244 		c_rx->mpa_crc_hd = NULL;
245 		return -ENOMEM;
246 	}
247 	c_tx->mpa_crc_hd->tfm = siw_crypto_shash;
248 	c_rx->mpa_crc_hd->tfm = siw_crypto_shash;
249 
250 	return 0;
251 }
252 
253 /*
254  * Send a non signalled READ or WRITE to peer side as negotiated
255  * with MPAv2 P2P setup protocol. The work request is only created
256  * as a current active WR and does not consume Send Queue space.
257  *
258  * Caller must hold QP state lock.
259  */
260 int siw_qp_mpa_rts(struct siw_qp *qp, enum mpa_v2_ctrl ctrl)
261 {
262 	struct siw_wqe *wqe = tx_wqe(qp);
263 	unsigned long flags;
264 	int rv = 0;
265 
266 	spin_lock_irqsave(&qp->sq_lock, flags);
267 
268 	if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
269 		spin_unlock_irqrestore(&qp->sq_lock, flags);
270 		return -EIO;
271 	}
272 	memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
273 
274 	wqe->wr_status = SIW_WR_QUEUED;
275 	wqe->sqe.flags = 0;
276 	wqe->sqe.num_sge = 1;
277 	wqe->sqe.sge[0].length = 0;
278 	wqe->sqe.sge[0].laddr = 0;
279 	wqe->sqe.sge[0].lkey = 0;
280 	/*
281 	 * While it must not be checked for inbound zero length
282 	 * READ/WRITE, some HW may treat STag 0 special.
283 	 */
284 	wqe->sqe.rkey = 1;
285 	wqe->sqe.raddr = 0;
286 	wqe->processed = 0;
287 
288 	if (ctrl & MPA_V2_RDMA_WRITE_RTR)
289 		wqe->sqe.opcode = SIW_OP_WRITE;
290 	else if (ctrl & MPA_V2_RDMA_READ_RTR) {
291 		struct siw_sqe *rreq;
292 
293 		wqe->sqe.opcode = SIW_OP_READ;
294 
295 		spin_lock(&qp->orq_lock);
296 
297 		rreq = orq_get_free(qp);
298 		if (rreq) {
299 			siw_read_to_orq(rreq, &wqe->sqe);
300 			qp->orq_put++;
301 		} else
302 			rv = -EIO;
303 
304 		spin_unlock(&qp->orq_lock);
305 	} else
306 		rv = -EINVAL;
307 
308 	if (rv)
309 		wqe->wr_status = SIW_WR_IDLE;
310 
311 	spin_unlock_irqrestore(&qp->sq_lock, flags);
312 
313 	if (!rv)
314 		rv = siw_sq_start(qp);
315 
316 	return rv;
317 }
318 
319 /*
320  * Map memory access error to DDP tagged error
321  */
322 enum ddp_ecode siw_tagged_error(enum siw_access_state state)
323 {
324 	switch (state) {
325 	case E_STAG_INVALID:
326 		return DDP_ECODE_T_INVALID_STAG;
327 	case E_BASE_BOUNDS:
328 		return DDP_ECODE_T_BASE_BOUNDS;
329 	case E_PD_MISMATCH:
330 		return DDP_ECODE_T_STAG_NOT_ASSOC;
331 	case E_ACCESS_PERM:
332 		/*
333 		 * RFC 5041 (DDP) lacks an ecode for insufficient access
334 		 * permissions. 'Invalid STag' seem to be the closest
335 		 * match though.
336 		 */
337 		return DDP_ECODE_T_INVALID_STAG;
338 	default:
339 		WARN_ON(1);
340 		return DDP_ECODE_T_INVALID_STAG;
341 	}
342 }
343 
344 /*
345  * Map memory access error to RDMAP protection error
346  */
347 enum rdmap_ecode siw_rdmap_error(enum siw_access_state state)
348 {
349 	switch (state) {
350 	case E_STAG_INVALID:
351 		return RDMAP_ECODE_INVALID_STAG;
352 	case E_BASE_BOUNDS:
353 		return RDMAP_ECODE_BASE_BOUNDS;
354 	case E_PD_MISMATCH:
355 		return RDMAP_ECODE_STAG_NOT_ASSOC;
356 	case E_ACCESS_PERM:
357 		return RDMAP_ECODE_ACCESS_RIGHTS;
358 	default:
359 		return RDMAP_ECODE_UNSPECIFIED;
360 	}
361 }
362 
363 void siw_init_terminate(struct siw_qp *qp, enum term_elayer layer, u8 etype,
364 			u8 ecode, int in_tx)
365 {
366 	if (!qp->term_info.valid) {
367 		memset(&qp->term_info, 0, sizeof(qp->term_info));
368 		qp->term_info.layer = layer;
369 		qp->term_info.etype = etype;
370 		qp->term_info.ecode = ecode;
371 		qp->term_info.in_tx = in_tx;
372 		qp->term_info.valid = 1;
373 	}
374 	siw_dbg_qp(qp, "init TERM: layer %d, type %d, code %d, in tx %s\n",
375 		   layer, etype, ecode, in_tx ? "yes" : "no");
376 }
377 
378 /*
379  * Send a TERMINATE message, as defined in RFC's 5040/5041/5044/6581.
380  * Sending TERMINATE messages is best effort - such messages
381  * can only be send if the QP is still connected and it does
382  * not have another outbound message in-progress, i.e. the
383  * TERMINATE message must not interfer with an incomplete current
384  * transmit operation.
385  */
386 void siw_send_terminate(struct siw_qp *qp)
387 {
388 	struct kvec iov[3];
389 	struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_EOR };
390 	struct iwarp_terminate *term = NULL;
391 	union iwarp_hdr *err_hdr = NULL;
392 	struct socket *s = qp->attrs.sk;
393 	struct siw_rx_stream *srx = &qp->rx_stream;
394 	union iwarp_hdr *rx_hdr = &srx->hdr;
395 	u32 crc = 0;
396 	int num_frags, len_terminate, rv;
397 
398 	if (!qp->term_info.valid)
399 		return;
400 
401 	qp->term_info.valid = 0;
402 
403 	if (tx_wqe(qp)->wr_status == SIW_WR_INPROGRESS) {
404 		siw_dbg_qp(qp, "cannot send TERMINATE: op %d in progress\n",
405 			   tx_type(tx_wqe(qp)));
406 		return;
407 	}
408 	if (!s && qp->cep)
409 		/* QP not yet in RTS. Take socket from connection end point */
410 		s = qp->cep->sock;
411 
412 	if (!s) {
413 		siw_dbg_qp(qp, "cannot send TERMINATE: not connected\n");
414 		return;
415 	}
416 
417 	term = kzalloc(sizeof(*term), GFP_KERNEL);
418 	if (!term)
419 		return;
420 
421 	term->ddp_qn = cpu_to_be32(RDMAP_UNTAGGED_QN_TERMINATE);
422 	term->ddp_mo = 0;
423 	term->ddp_msn = cpu_to_be32(1);
424 
425 	iov[0].iov_base = term;
426 	iov[0].iov_len = sizeof(*term);
427 
428 	if ((qp->term_info.layer == TERM_ERROR_LAYER_DDP) ||
429 	    ((qp->term_info.layer == TERM_ERROR_LAYER_RDMAP) &&
430 	     (qp->term_info.etype != RDMAP_ETYPE_CATASTROPHIC))) {
431 		err_hdr = kzalloc(sizeof(*err_hdr), GFP_KERNEL);
432 		if (!err_hdr) {
433 			kfree(term);
434 			return;
435 		}
436 	}
437 	memcpy(&term->ctrl, &iwarp_pktinfo[RDMAP_TERMINATE].ctrl,
438 	       sizeof(struct iwarp_ctrl));
439 
440 	__rdmap_term_set_layer(term, qp->term_info.layer);
441 	__rdmap_term_set_etype(term, qp->term_info.etype);
442 	__rdmap_term_set_ecode(term, qp->term_info.ecode);
443 
444 	switch (qp->term_info.layer) {
445 	case TERM_ERROR_LAYER_RDMAP:
446 		if (qp->term_info.etype == RDMAP_ETYPE_CATASTROPHIC)
447 			/* No additional DDP/RDMAP header to be included */
448 			break;
449 
450 		if (qp->term_info.etype == RDMAP_ETYPE_REMOTE_PROTECTION) {
451 			/*
452 			 * Complete RDMAP frame will get attached, and
453 			 * DDP segment length is valid
454 			 */
455 			term->flag_m = 1;
456 			term->flag_d = 1;
457 			term->flag_r = 1;
458 
459 			if (qp->term_info.in_tx) {
460 				struct iwarp_rdma_rreq *rreq;
461 				struct siw_wqe *wqe = tx_wqe(qp);
462 
463 				/* Inbound RREQ error, detected during
464 				 * RRESP creation. Take state from
465 				 * current TX work queue element to
466 				 * reconstruct peers RREQ.
467 				 */
468 				rreq = (struct iwarp_rdma_rreq *)err_hdr;
469 
470 				memcpy(&rreq->ctrl,
471 				       &iwarp_pktinfo[RDMAP_RDMA_READ_REQ].ctrl,
472 				       sizeof(struct iwarp_ctrl));
473 
474 				rreq->rsvd = 0;
475 				rreq->ddp_qn =
476 					htonl(RDMAP_UNTAGGED_QN_RDMA_READ);
477 
478 				/* Provide RREQ's MSN as kept aside */
479 				rreq->ddp_msn = htonl(wqe->sqe.sge[0].length);
480 
481 				rreq->ddp_mo = htonl(wqe->processed);
482 				rreq->sink_stag = htonl(wqe->sqe.rkey);
483 				rreq->sink_to = cpu_to_be64(wqe->sqe.raddr);
484 				rreq->read_size = htonl(wqe->sqe.sge[0].length);
485 				rreq->source_stag = htonl(wqe->sqe.sge[0].lkey);
486 				rreq->source_to =
487 					cpu_to_be64(wqe->sqe.sge[0].laddr);
488 
489 				iov[1].iov_base = rreq;
490 				iov[1].iov_len = sizeof(*rreq);
491 
492 				rx_hdr = (union iwarp_hdr *)rreq;
493 			} else {
494 				/* Take RDMAP/DDP information from
495 				 * current (failed) inbound frame.
496 				 */
497 				iov[1].iov_base = rx_hdr;
498 
499 				if (__rdmap_get_opcode(&rx_hdr->ctrl) ==
500 				    RDMAP_RDMA_READ_REQ)
501 					iov[1].iov_len =
502 						sizeof(struct iwarp_rdma_rreq);
503 				else /* SEND type */
504 					iov[1].iov_len =
505 						sizeof(struct iwarp_send);
506 			}
507 		} else {
508 			/* Do not report DDP hdr information if packet
509 			 * layout is unknown
510 			 */
511 			if ((qp->term_info.ecode == RDMAP_ECODE_VERSION) ||
512 			    (qp->term_info.ecode == RDMAP_ECODE_OPCODE))
513 				break;
514 
515 			iov[1].iov_base = rx_hdr;
516 
517 			/* Only DDP frame will get attached */
518 			if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
519 				iov[1].iov_len =
520 					sizeof(struct iwarp_rdma_write);
521 			else
522 				iov[1].iov_len = sizeof(struct iwarp_send);
523 
524 			term->flag_m = 1;
525 			term->flag_d = 1;
526 		}
527 		term->ctrl.mpa_len = cpu_to_be16(iov[1].iov_len);
528 		break;
529 
530 	case TERM_ERROR_LAYER_DDP:
531 		/* Report error encountered while DDP processing.
532 		 * This can only happen as a result of inbound
533 		 * DDP processing
534 		 */
535 
536 		/* Do not report DDP hdr information if packet
537 		 * layout is unknown
538 		 */
539 		if (((qp->term_info.etype == DDP_ETYPE_TAGGED_BUF) &&
540 		     (qp->term_info.ecode == DDP_ECODE_T_VERSION)) ||
541 		    ((qp->term_info.etype == DDP_ETYPE_UNTAGGED_BUF) &&
542 		     (qp->term_info.ecode == DDP_ECODE_UT_VERSION)))
543 			break;
544 
545 		iov[1].iov_base = rx_hdr;
546 
547 		if (rx_hdr->ctrl.ddp_rdmap_ctrl & DDP_FLAG_TAGGED)
548 			iov[1].iov_len = sizeof(struct iwarp_ctrl_tagged);
549 		else
550 			iov[1].iov_len = sizeof(struct iwarp_ctrl_untagged);
551 
552 		term->flag_m = 1;
553 		term->flag_d = 1;
554 		break;
555 
556 	default:
557 		break;
558 	}
559 	if (term->flag_m || term->flag_d || term->flag_r) {
560 		iov[2].iov_base = &crc;
561 		iov[2].iov_len = sizeof(crc);
562 		len_terminate = sizeof(*term) + iov[1].iov_len + MPA_CRC_SIZE;
563 		num_frags = 3;
564 	} else {
565 		iov[1].iov_base = &crc;
566 		iov[1].iov_len = sizeof(crc);
567 		len_terminate = sizeof(*term) + MPA_CRC_SIZE;
568 		num_frags = 2;
569 	}
570 
571 	/* Adjust DDP Segment Length parameter, if valid */
572 	if (term->flag_m) {
573 		u32 real_ddp_len = be16_to_cpu(rx_hdr->ctrl.mpa_len);
574 		enum rdma_opcode op = __rdmap_get_opcode(&rx_hdr->ctrl);
575 
576 		real_ddp_len -= iwarp_pktinfo[op].hdr_len - MPA_HDR_SIZE;
577 		rx_hdr->ctrl.mpa_len = cpu_to_be16(real_ddp_len);
578 	}
579 
580 	term->ctrl.mpa_len =
581 		cpu_to_be16(len_terminate - (MPA_HDR_SIZE + MPA_CRC_SIZE));
582 	if (qp->tx_ctx.mpa_crc_hd) {
583 		crypto_shash_init(qp->tx_ctx.mpa_crc_hd);
584 		if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
585 					(u8 *)iov[0].iov_base,
586 					iov[0].iov_len))
587 			goto out;
588 
589 		if (num_frags == 3) {
590 			if (crypto_shash_update(qp->tx_ctx.mpa_crc_hd,
591 						(u8 *)iov[1].iov_base,
592 						iov[1].iov_len))
593 				goto out;
594 		}
595 		crypto_shash_final(qp->tx_ctx.mpa_crc_hd, (u8 *)&crc);
596 	}
597 
598 	rv = kernel_sendmsg(s, &msg, iov, num_frags, len_terminate);
599 	siw_dbg_qp(qp, "sent TERM: %s, layer %d, type %d, code %d (%d bytes)\n",
600 		   rv == len_terminate ? "success" : "failure",
601 		   __rdmap_term_layer(term), __rdmap_term_etype(term),
602 		   __rdmap_term_ecode(term), rv);
603 out:
604 	kfree(term);
605 	kfree(err_hdr);
606 }
607 
608 /*
609  * Handle all attrs other than state
610  */
611 static void siw_qp_modify_nonstate(struct siw_qp *qp,
612 				   struct siw_qp_attrs *attrs,
613 				   enum siw_qp_attr_mask mask)
614 {
615 	if (mask & SIW_QP_ATTR_ACCESS_FLAGS) {
616 		if (attrs->flags & SIW_RDMA_BIND_ENABLED)
617 			qp->attrs.flags |= SIW_RDMA_BIND_ENABLED;
618 		else
619 			qp->attrs.flags &= ~SIW_RDMA_BIND_ENABLED;
620 
621 		if (attrs->flags & SIW_RDMA_WRITE_ENABLED)
622 			qp->attrs.flags |= SIW_RDMA_WRITE_ENABLED;
623 		else
624 			qp->attrs.flags &= ~SIW_RDMA_WRITE_ENABLED;
625 
626 		if (attrs->flags & SIW_RDMA_READ_ENABLED)
627 			qp->attrs.flags |= SIW_RDMA_READ_ENABLED;
628 		else
629 			qp->attrs.flags &= ~SIW_RDMA_READ_ENABLED;
630 	}
631 }
632 
633 static int siw_qp_nextstate_from_idle(struct siw_qp *qp,
634 				      struct siw_qp_attrs *attrs,
635 				      enum siw_qp_attr_mask mask)
636 {
637 	int rv = 0;
638 
639 	switch (attrs->state) {
640 	case SIW_QP_STATE_RTS:
641 		if (attrs->flags & SIW_MPA_CRC) {
642 			rv = siw_qp_enable_crc(qp);
643 			if (rv)
644 				break;
645 		}
646 		if (!(mask & SIW_QP_ATTR_LLP_HANDLE)) {
647 			siw_dbg_qp(qp, "no socket\n");
648 			rv = -EINVAL;
649 			break;
650 		}
651 		if (!(mask & SIW_QP_ATTR_MPA)) {
652 			siw_dbg_qp(qp, "no MPA\n");
653 			rv = -EINVAL;
654 			break;
655 		}
656 		/*
657 		 * Initialize iWARP TX state
658 		 */
659 		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 0;
660 		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 0;
661 		qp->tx_ctx.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 0;
662 
663 		/*
664 		 * Initialize iWARP RX state
665 		 */
666 		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_SEND] = 1;
667 		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ] = 1;
668 		qp->rx_stream.ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] = 1;
669 
670 		/*
671 		 * init IRD free queue, caller has already checked
672 		 * limits.
673 		 */
674 		rv = siw_qp_readq_init(qp, attrs->irq_size,
675 				       attrs->orq_size);
676 		if (rv)
677 			break;
678 
679 		qp->attrs.sk = attrs->sk;
680 		qp->attrs.state = SIW_QP_STATE_RTS;
681 
682 		siw_dbg_qp(qp, "enter RTS: crc=%s, ord=%u, ird=%u\n",
683 			   attrs->flags & SIW_MPA_CRC ? "y" : "n",
684 			   qp->attrs.orq_size, qp->attrs.irq_size);
685 		break;
686 
687 	case SIW_QP_STATE_ERROR:
688 		siw_rq_flush(qp);
689 		qp->attrs.state = SIW_QP_STATE_ERROR;
690 		if (qp->cep) {
691 			siw_cep_put(qp->cep);
692 			qp->cep = NULL;
693 		}
694 		break;
695 
696 	default:
697 		break;
698 	}
699 	return rv;
700 }
701 
702 static int siw_qp_nextstate_from_rts(struct siw_qp *qp,
703 				     struct siw_qp_attrs *attrs)
704 {
705 	int drop_conn = 0;
706 
707 	switch (attrs->state) {
708 	case SIW_QP_STATE_CLOSING:
709 		/*
710 		 * Verbs: move to IDLE if SQ and ORQ are empty.
711 		 * Move to ERROR otherwise. But first of all we must
712 		 * close the connection. So we keep CLOSING or ERROR
713 		 * as a transient state, schedule connection drop work
714 		 * and wait for the socket state change upcall to
715 		 * come back closed.
716 		 */
717 		if (tx_wqe(qp)->wr_status == SIW_WR_IDLE) {
718 			qp->attrs.state = SIW_QP_STATE_CLOSING;
719 		} else {
720 			qp->attrs.state = SIW_QP_STATE_ERROR;
721 			siw_sq_flush(qp);
722 		}
723 		siw_rq_flush(qp);
724 
725 		drop_conn = 1;
726 		break;
727 
728 	case SIW_QP_STATE_TERMINATE:
729 		qp->attrs.state = SIW_QP_STATE_TERMINATE;
730 
731 		siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
732 				   RDMAP_ETYPE_CATASTROPHIC,
733 				   RDMAP_ECODE_UNSPECIFIED, 1);
734 		drop_conn = 1;
735 		break;
736 
737 	case SIW_QP_STATE_ERROR:
738 		/*
739 		 * This is an emergency close.
740 		 *
741 		 * Any in progress transmit operation will get
742 		 * cancelled.
743 		 * This will likely result in a protocol failure,
744 		 * if a TX operation is in transit. The caller
745 		 * could unconditional wait to give the current
746 		 * operation a chance to complete.
747 		 * Esp., how to handle the non-empty IRQ case?
748 		 * The peer was asking for data transfer at a valid
749 		 * point in time.
750 		 */
751 		siw_sq_flush(qp);
752 		siw_rq_flush(qp);
753 		qp->attrs.state = SIW_QP_STATE_ERROR;
754 		drop_conn = 1;
755 		break;
756 
757 	default:
758 		break;
759 	}
760 	return drop_conn;
761 }
762 
763 static void siw_qp_nextstate_from_term(struct siw_qp *qp,
764 				       struct siw_qp_attrs *attrs)
765 {
766 	switch (attrs->state) {
767 	case SIW_QP_STATE_ERROR:
768 		siw_rq_flush(qp);
769 		qp->attrs.state = SIW_QP_STATE_ERROR;
770 
771 		if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
772 			siw_sq_flush(qp);
773 		break;
774 
775 	default:
776 		break;
777 	}
778 }
779 
780 static int siw_qp_nextstate_from_close(struct siw_qp *qp,
781 				       struct siw_qp_attrs *attrs)
782 {
783 	int rv = 0;
784 
785 	switch (attrs->state) {
786 	case SIW_QP_STATE_IDLE:
787 		WARN_ON(tx_wqe(qp)->wr_status != SIW_WR_IDLE);
788 		qp->attrs.state = SIW_QP_STATE_IDLE;
789 		break;
790 
791 	case SIW_QP_STATE_CLOSING:
792 		/*
793 		 * The LLP may already moved the QP to closing
794 		 * due to graceful peer close init
795 		 */
796 		break;
797 
798 	case SIW_QP_STATE_ERROR:
799 		/*
800 		 * QP was moved to CLOSING by LLP event
801 		 * not yet seen by user.
802 		 */
803 		qp->attrs.state = SIW_QP_STATE_ERROR;
804 
805 		if (tx_wqe(qp)->wr_status != SIW_WR_IDLE)
806 			siw_sq_flush(qp);
807 
808 		siw_rq_flush(qp);
809 		break;
810 
811 	default:
812 		siw_dbg_qp(qp, "state transition undefined: %s => %s\n",
813 			   siw_qp_state_to_string[qp->attrs.state],
814 			   siw_qp_state_to_string[attrs->state]);
815 
816 		rv = -ECONNABORTED;
817 	}
818 	return rv;
819 }
820 
821 /*
822  * Caller must hold qp->state_lock
823  */
824 int siw_qp_modify(struct siw_qp *qp, struct siw_qp_attrs *attrs,
825 		  enum siw_qp_attr_mask mask)
826 {
827 	int drop_conn = 0, rv = 0;
828 
829 	if (!mask)
830 		return 0;
831 
832 	siw_dbg_qp(qp, "state: %s => %s\n",
833 		   siw_qp_state_to_string[qp->attrs.state],
834 		   siw_qp_state_to_string[attrs->state]);
835 
836 	if (mask != SIW_QP_ATTR_STATE)
837 		siw_qp_modify_nonstate(qp, attrs, mask);
838 
839 	if (!(mask & SIW_QP_ATTR_STATE))
840 		return 0;
841 
842 	switch (qp->attrs.state) {
843 	case SIW_QP_STATE_IDLE:
844 	case SIW_QP_STATE_RTR:
845 		rv = siw_qp_nextstate_from_idle(qp, attrs, mask);
846 		break;
847 
848 	case SIW_QP_STATE_RTS:
849 		drop_conn = siw_qp_nextstate_from_rts(qp, attrs);
850 		break;
851 
852 	case SIW_QP_STATE_TERMINATE:
853 		siw_qp_nextstate_from_term(qp, attrs);
854 		break;
855 
856 	case SIW_QP_STATE_CLOSING:
857 		siw_qp_nextstate_from_close(qp, attrs);
858 		break;
859 	default:
860 		break;
861 	}
862 	if (drop_conn)
863 		siw_qp_cm_drop(qp, 0);
864 
865 	return rv;
866 }
867 
868 void siw_read_to_orq(struct siw_sqe *rreq, struct siw_sqe *sqe)
869 {
870 	rreq->id = sqe->id;
871 	rreq->opcode = sqe->opcode;
872 	rreq->sge[0].laddr = sqe->sge[0].laddr;
873 	rreq->sge[0].length = sqe->sge[0].length;
874 	rreq->sge[0].lkey = sqe->sge[0].lkey;
875 	rreq->sge[1].lkey = sqe->sge[1].lkey;
876 	rreq->flags = sqe->flags | SIW_WQE_VALID;
877 	rreq->num_sge = 1;
878 }
879 
880 /*
881  * Must be called with SQ locked.
882  * To avoid complete SQ starvation by constant inbound READ requests,
883  * the active IRQ will not be served after qp->irq_burst, if the
884  * SQ has pending work.
885  */
886 int siw_activate_tx(struct siw_qp *qp)
887 {
888 	struct siw_sqe *irqe, *sqe;
889 	struct siw_wqe *wqe = tx_wqe(qp);
890 	int rv = 1;
891 
892 	irqe = &qp->irq[qp->irq_get % qp->attrs.irq_size];
893 
894 	if (irqe->flags & SIW_WQE_VALID) {
895 		sqe = sq_get_next(qp);
896 
897 		/*
898 		 * Avoid local WQE processing starvation in case
899 		 * of constant inbound READ request stream
900 		 */
901 		if (sqe && ++qp->irq_burst >= SIW_IRQ_MAXBURST_SQ_ACTIVE) {
902 			qp->irq_burst = 0;
903 			goto skip_irq;
904 		}
905 		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
906 		wqe->wr_status = SIW_WR_QUEUED;
907 
908 		/* start READ RESPONSE */
909 		wqe->sqe.opcode = SIW_OP_READ_RESPONSE;
910 		wqe->sqe.flags = 0;
911 		if (irqe->num_sge) {
912 			wqe->sqe.num_sge = 1;
913 			wqe->sqe.sge[0].length = irqe->sge[0].length;
914 			wqe->sqe.sge[0].laddr = irqe->sge[0].laddr;
915 			wqe->sqe.sge[0].lkey = irqe->sge[0].lkey;
916 		} else {
917 			wqe->sqe.num_sge = 0;
918 		}
919 
920 		/* Retain original RREQ's message sequence number for
921 		 * potential error reporting cases.
922 		 */
923 		wqe->sqe.sge[1].length = irqe->sge[1].length;
924 
925 		wqe->sqe.rkey = irqe->rkey;
926 		wqe->sqe.raddr = irqe->raddr;
927 
928 		wqe->processed = 0;
929 		qp->irq_get++;
930 
931 		/* mark current IRQ entry free */
932 		smp_store_mb(irqe->flags, 0);
933 
934 		goto out;
935 	}
936 	sqe = sq_get_next(qp);
937 	if (sqe) {
938 skip_irq:
939 		memset(wqe->mem, 0, sizeof(*wqe->mem) * SIW_MAX_SGE);
940 		wqe->wr_status = SIW_WR_QUEUED;
941 
942 		/* First copy SQE to kernel private memory */
943 		memcpy(&wqe->sqe, sqe, sizeof(*sqe));
944 
945 		if (wqe->sqe.opcode >= SIW_NUM_OPCODES) {
946 			rv = -EINVAL;
947 			goto out;
948 		}
949 		if (wqe->sqe.flags & SIW_WQE_INLINE) {
950 			if (wqe->sqe.opcode != SIW_OP_SEND &&
951 			    wqe->sqe.opcode != SIW_OP_WRITE) {
952 				rv = -EINVAL;
953 				goto out;
954 			}
955 			if (wqe->sqe.sge[0].length > SIW_MAX_INLINE) {
956 				rv = -EINVAL;
957 				goto out;
958 			}
959 			wqe->sqe.sge[0].laddr = (uintptr_t)&wqe->sqe.sge[1];
960 			wqe->sqe.sge[0].lkey = 0;
961 			wqe->sqe.num_sge = 1;
962 		}
963 		if (wqe->sqe.flags & SIW_WQE_READ_FENCE) {
964 			/* A READ cannot be fenced */
965 			if (unlikely(wqe->sqe.opcode == SIW_OP_READ ||
966 				     wqe->sqe.opcode ==
967 					     SIW_OP_READ_LOCAL_INV)) {
968 				siw_dbg_qp(qp, "cannot fence read\n");
969 				rv = -EINVAL;
970 				goto out;
971 			}
972 			spin_lock(&qp->orq_lock);
973 
974 			if (!siw_orq_empty(qp)) {
975 				qp->tx_ctx.orq_fence = 1;
976 				rv = 0;
977 			}
978 			spin_unlock(&qp->orq_lock);
979 
980 		} else if (wqe->sqe.opcode == SIW_OP_READ ||
981 			   wqe->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
982 			struct siw_sqe *rreq;
983 
984 			wqe->sqe.num_sge = 1;
985 
986 			spin_lock(&qp->orq_lock);
987 
988 			rreq = orq_get_free(qp);
989 			if (rreq) {
990 				/*
991 				 * Make an immediate copy in ORQ to be ready
992 				 * to process loopback READ reply
993 				 */
994 				siw_read_to_orq(rreq, &wqe->sqe);
995 				qp->orq_put++;
996 			} else {
997 				qp->tx_ctx.orq_fence = 1;
998 				rv = 0;
999 			}
1000 			spin_unlock(&qp->orq_lock);
1001 		}
1002 
1003 		/* Clear SQE, can be re-used by application */
1004 		smp_store_mb(sqe->flags, 0);
1005 		qp->sq_get++;
1006 	} else {
1007 		rv = 0;
1008 	}
1009 out:
1010 	if (unlikely(rv < 0)) {
1011 		siw_dbg_qp(qp, "error %d\n", rv);
1012 		wqe->wr_status = SIW_WR_IDLE;
1013 	}
1014 	return rv;
1015 }
1016 
1017 /*
1018  * Check if current CQ state qualifies for calling CQ completion
1019  * handler. Must be called with CQ lock held.
1020  */
1021 static bool siw_cq_notify_now(struct siw_cq *cq, u32 flags)
1022 {
1023 	u32 cq_notify;
1024 
1025 	if (!cq->base_cq.comp_handler)
1026 		return false;
1027 
1028 	/* Read application shared notification state */
1029 	cq_notify = READ_ONCE(cq->notify->flags);
1030 
1031 	if ((cq_notify & SIW_NOTIFY_NEXT_COMPLETION) ||
1032 	    ((cq_notify & SIW_NOTIFY_SOLICITED) &&
1033 	     (flags & SIW_WQE_SOLICITED))) {
1034 		/*
1035 		 * CQ notification is one-shot: Since the
1036 		 * current CQE causes user notification,
1037 		 * the CQ gets dis-aremd and must be re-aremd
1038 		 * by the user for a new notification.
1039 		 */
1040 		WRITE_ONCE(cq->notify->flags, SIW_NOTIFY_NOT);
1041 
1042 		return true;
1043 	}
1044 	return false;
1045 }
1046 
1047 int siw_sqe_complete(struct siw_qp *qp, struct siw_sqe *sqe, u32 bytes,
1048 		     enum siw_wc_status status)
1049 {
1050 	struct siw_cq *cq = qp->scq;
1051 	int rv = 0;
1052 
1053 	if (cq) {
1054 		u32 sqe_flags = sqe->flags;
1055 		struct siw_cqe *cqe;
1056 		u32 idx;
1057 		unsigned long flags;
1058 
1059 		spin_lock_irqsave(&cq->lock, flags);
1060 
1061 		idx = cq->cq_put % cq->num_cqe;
1062 		cqe = &cq->queue[idx];
1063 
1064 		if (!READ_ONCE(cqe->flags)) {
1065 			bool notify;
1066 
1067 			cqe->id = sqe->id;
1068 			cqe->opcode = sqe->opcode;
1069 			cqe->status = status;
1070 			cqe->imm_data = 0;
1071 			cqe->bytes = bytes;
1072 
1073 			if (rdma_is_kernel_res(&cq->base_cq.res))
1074 				cqe->base_qp = &qp->base_qp;
1075 			else
1076 				cqe->qp_id = qp_id(qp);
1077 
1078 			/* mark CQE valid for application */
1079 			WRITE_ONCE(cqe->flags, SIW_WQE_VALID);
1080 			/* recycle SQE */
1081 			smp_store_mb(sqe->flags, 0);
1082 
1083 			cq->cq_put++;
1084 			notify = siw_cq_notify_now(cq, sqe_flags);
1085 
1086 			spin_unlock_irqrestore(&cq->lock, flags);
1087 
1088 			if (notify) {
1089 				siw_dbg_cq(cq, "Call completion handler\n");
1090 				cq->base_cq.comp_handler(&cq->base_cq,
1091 						cq->base_cq.cq_context);
1092 			}
1093 		} else {
1094 			spin_unlock_irqrestore(&cq->lock, flags);
1095 			rv = -ENOMEM;
1096 			siw_cq_event(cq, IB_EVENT_CQ_ERR);
1097 		}
1098 	} else {
1099 		/* recycle SQE */
1100 		smp_store_mb(sqe->flags, 0);
1101 	}
1102 	return rv;
1103 }
1104 
1105 int siw_rqe_complete(struct siw_qp *qp, struct siw_rqe *rqe, u32 bytes,
1106 		     u32 inval_stag, enum siw_wc_status status)
1107 {
1108 	struct siw_cq *cq = qp->rcq;
1109 	int rv = 0;
1110 
1111 	if (cq) {
1112 		struct siw_cqe *cqe;
1113 		u32 idx;
1114 		unsigned long flags;
1115 
1116 		spin_lock_irqsave(&cq->lock, flags);
1117 
1118 		idx = cq->cq_put % cq->num_cqe;
1119 		cqe = &cq->queue[idx];
1120 
1121 		if (!READ_ONCE(cqe->flags)) {
1122 			bool notify;
1123 			u8 cqe_flags = SIW_WQE_VALID;
1124 
1125 			cqe->id = rqe->id;
1126 			cqe->opcode = SIW_OP_RECEIVE;
1127 			cqe->status = status;
1128 			cqe->imm_data = 0;
1129 			cqe->bytes = bytes;
1130 
1131 			if (rdma_is_kernel_res(&cq->base_cq.res)) {
1132 				cqe->base_qp = &qp->base_qp;
1133 				if (inval_stag) {
1134 					cqe_flags |= SIW_WQE_REM_INVAL;
1135 					cqe->inval_stag = inval_stag;
1136 				}
1137 			} else {
1138 				cqe->qp_id = qp_id(qp);
1139 			}
1140 			/* mark CQE valid for application */
1141 			WRITE_ONCE(cqe->flags, cqe_flags);
1142 			/* recycle RQE */
1143 			smp_store_mb(rqe->flags, 0);
1144 
1145 			cq->cq_put++;
1146 			notify = siw_cq_notify_now(cq, SIW_WQE_SIGNALLED);
1147 
1148 			spin_unlock_irqrestore(&cq->lock, flags);
1149 
1150 			if (notify) {
1151 				siw_dbg_cq(cq, "Call completion handler\n");
1152 				cq->base_cq.comp_handler(&cq->base_cq,
1153 						cq->base_cq.cq_context);
1154 			}
1155 		} else {
1156 			spin_unlock_irqrestore(&cq->lock, flags);
1157 			rv = -ENOMEM;
1158 			siw_cq_event(cq, IB_EVENT_CQ_ERR);
1159 		}
1160 	} else {
1161 		/* recycle RQE */
1162 		smp_store_mb(rqe->flags, 0);
1163 	}
1164 	return rv;
1165 }
1166 
1167 /*
1168  * siw_sq_flush()
1169  *
1170  * Flush SQ and ORRQ entries to CQ.
1171  *
1172  * Must be called with QP state write lock held.
1173  * Therefore, SQ and ORQ lock must not be taken.
1174  */
1175 void siw_sq_flush(struct siw_qp *qp)
1176 {
1177 	struct siw_sqe *sqe;
1178 	struct siw_wqe *wqe = tx_wqe(qp);
1179 	int async_event = 0;
1180 
1181 	/*
1182 	 * Start with completing any work currently on the ORQ
1183 	 */
1184 	while (qp->attrs.orq_size) {
1185 		sqe = &qp->orq[qp->orq_get % qp->attrs.orq_size];
1186 		if (!READ_ONCE(sqe->flags))
1187 			break;
1188 
1189 		if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
1190 			break;
1191 
1192 		WRITE_ONCE(sqe->flags, 0);
1193 		qp->orq_get++;
1194 	}
1195 	/*
1196 	 * Flush an in-progress WQE if present
1197 	 */
1198 	if (wqe->wr_status != SIW_WR_IDLE) {
1199 		siw_dbg_qp(qp, "flush current SQE, type %d, status %d\n",
1200 			   tx_type(wqe), wqe->wr_status);
1201 
1202 		siw_wqe_put_mem(wqe, tx_type(wqe));
1203 
1204 		if (tx_type(wqe) != SIW_OP_READ_RESPONSE &&
1205 		    ((tx_type(wqe) != SIW_OP_READ &&
1206 		      tx_type(wqe) != SIW_OP_READ_LOCAL_INV) ||
1207 		     wqe->wr_status == SIW_WR_QUEUED))
1208 			/*
1209 			 * An in-progress Read Request is already in
1210 			 * the ORQ
1211 			 */
1212 			siw_sqe_complete(qp, &wqe->sqe, wqe->bytes,
1213 					 SIW_WC_WR_FLUSH_ERR);
1214 
1215 		wqe->wr_status = SIW_WR_IDLE;
1216 	}
1217 	/*
1218 	 * Flush the Send Queue
1219 	 */
1220 	while (qp->attrs.sq_size) {
1221 		sqe = &qp->sendq[qp->sq_get % qp->attrs.sq_size];
1222 		if (!READ_ONCE(sqe->flags))
1223 			break;
1224 
1225 		async_event = 1;
1226 		if (siw_sqe_complete(qp, sqe, 0, SIW_WC_WR_FLUSH_ERR) != 0)
1227 			/*
1228 			 * Shall IB_EVENT_SQ_DRAINED be supressed if work
1229 			 * completion fails?
1230 			 */
1231 			break;
1232 
1233 		WRITE_ONCE(sqe->flags, 0);
1234 		qp->sq_get++;
1235 	}
1236 	if (async_event)
1237 		siw_qp_event(qp, IB_EVENT_SQ_DRAINED);
1238 }
1239 
1240 /*
1241  * siw_rq_flush()
1242  *
1243  * Flush recv queue entries to CQ. Also
1244  * takes care of pending active tagged and untagged
1245  * inbound transfers, which have target memory
1246  * referenced.
1247  *
1248  * Must be called with QP state write lock held.
1249  * Therefore, RQ lock must not be taken.
1250  */
1251 void siw_rq_flush(struct siw_qp *qp)
1252 {
1253 	struct siw_wqe *wqe = &qp->rx_untagged.wqe_active;
1254 
1255 	/*
1256 	 * Flush an in-progress untagged operation if present
1257 	 */
1258 	if (wqe->wr_status != SIW_WR_IDLE) {
1259 		siw_dbg_qp(qp, "flush current rqe, type %d, status %d\n",
1260 			   rx_type(wqe), wqe->wr_status);
1261 
1262 		siw_wqe_put_mem(wqe, rx_type(wqe));
1263 
1264 		if (rx_type(wqe) == SIW_OP_RECEIVE) {
1265 			siw_rqe_complete(qp, &wqe->rqe, wqe->bytes,
1266 					 0, SIW_WC_WR_FLUSH_ERR);
1267 		} else if (rx_type(wqe) != SIW_OP_READ &&
1268 			   rx_type(wqe) != SIW_OP_READ_RESPONSE &&
1269 			   rx_type(wqe) != SIW_OP_WRITE) {
1270 			siw_sqe_complete(qp, &wqe->sqe, 0, SIW_WC_WR_FLUSH_ERR);
1271 		}
1272 		wqe->wr_status = SIW_WR_IDLE;
1273 	}
1274 	wqe = &qp->rx_tagged.wqe_active;
1275 
1276 	if (wqe->wr_status != SIW_WR_IDLE) {
1277 		siw_wqe_put_mem(wqe, rx_type(wqe));
1278 		wqe->wr_status = SIW_WR_IDLE;
1279 	}
1280 	/*
1281 	 * Flush the Receive Queue
1282 	 */
1283 	while (qp->attrs.rq_size) {
1284 		struct siw_rqe *rqe =
1285 			&qp->recvq[qp->rq_get % qp->attrs.rq_size];
1286 
1287 		if (!READ_ONCE(rqe->flags))
1288 			break;
1289 
1290 		if (siw_rqe_complete(qp, rqe, 0, 0, SIW_WC_WR_FLUSH_ERR) != 0)
1291 			break;
1292 
1293 		WRITE_ONCE(rqe->flags, 0);
1294 		qp->rq_get++;
1295 	}
1296 }
1297 
1298 int siw_qp_add(struct siw_device *sdev, struct siw_qp *qp)
1299 {
1300 	int rv = xa_alloc(&sdev->qp_xa, &qp->base_qp.qp_num, qp, xa_limit_32b,
1301 			  GFP_KERNEL);
1302 
1303 	if (!rv) {
1304 		kref_init(&qp->ref);
1305 		qp->sdev = sdev;
1306 		siw_dbg_qp(qp, "new QP\n");
1307 	}
1308 	return rv;
1309 }
1310 
1311 void siw_free_qp(struct kref *ref)
1312 {
1313 	struct siw_qp *found, *qp = container_of(ref, struct siw_qp, ref);
1314 	struct siw_device *sdev = qp->sdev;
1315 	unsigned long flags;
1316 
1317 	if (qp->cep)
1318 		siw_cep_put(qp->cep);
1319 
1320 	found = xa_erase(&sdev->qp_xa, qp_id(qp));
1321 	WARN_ON(found != qp);
1322 	spin_lock_irqsave(&sdev->lock, flags);
1323 	list_del(&qp->devq);
1324 	spin_unlock_irqrestore(&sdev->lock, flags);
1325 
1326 	vfree(qp->sendq);
1327 	vfree(qp->recvq);
1328 	vfree(qp->irq);
1329 	vfree(qp->orq);
1330 
1331 	siw_put_tx_cpu(qp->tx_cpu);
1332 
1333 	atomic_dec(&sdev->num_qp);
1334 	siw_dbg_qp(qp, "free QP\n");
1335 	kfree_rcu(qp, rcu);
1336 }
1337