xref: /linux/drivers/infiniband/hw/hfi1/ruc.c (revision 32786fdc9506aeba98278c1844d4bfb766863832)
1 /*
2  * Copyright(c) 2015, 2016 Intel Corporation.
3  *
4  * This file is provided under a dual BSD/GPLv2 license.  When using or
5  * redistributing this file, you may do so under either license.
6  *
7  * GPL LICENSE SUMMARY
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of version 2 of the GNU General Public License as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful, but
14  * WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License for more details.
17  *
18  * BSD LICENSE
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  *
24  *  - Redistributions of source code must retain the above copyright
25  *    notice, this list of conditions and the following disclaimer.
26  *  - Redistributions in binary form must reproduce the above copyright
27  *    notice, this list of conditions and the following disclaimer in
28  *    the documentation and/or other materials provided with the
29  *    distribution.
30  *  - Neither the name of Intel Corporation nor the names of its
31  *    contributors may be used to endorse or promote products derived
32  *    from this software without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45  *
46  */
47 
48 #include <linux/spinlock.h>
49 
50 #include "hfi.h"
51 #include "mad.h"
52 #include "qp.h"
53 #include "verbs_txreq.h"
54 #include "trace.h"
55 
56 /*
57  * Convert the AETH RNR timeout code into the number of microseconds.
58  */
59 const u32 ib_hfi1_rnr_table[32] = {
60 	655360,	/* 00: 655.36 */
61 	10,	/* 01:    .01 */
62 	20,	/* 02:    .02 */
63 	30,	/* 03:    .03 */
64 	40,	/* 04:    .04 */
65 	60,	/* 05:    .06 */
66 	80,	/* 06:    .08 */
67 	120,	/* 07:    .12 */
68 	160,	/* 08:    .16 */
69 	240,	/* 09:    .24 */
70 	320,	/* 0A:    .32 */
71 	480,	/* 0B:    .48 */
72 	640,	/* 0C:    .64 */
73 	960,	/* 0D:    .96 */
74 	1280,	/* 0E:   1.28 */
75 	1920,	/* 0F:   1.92 */
76 	2560,	/* 10:   2.56 */
77 	3840,	/* 11:   3.84 */
78 	5120,	/* 12:   5.12 */
79 	7680,	/* 13:   7.68 */
80 	10240,	/* 14:  10.24 */
81 	15360,	/* 15:  15.36 */
82 	20480,	/* 16:  20.48 */
83 	30720,	/* 17:  30.72 */
84 	40960,	/* 18:  40.96 */
85 	61440,	/* 19:  61.44 */
86 	81920,	/* 1A:  81.92 */
87 	122880,	/* 1B: 122.88 */
88 	163840,	/* 1C: 163.84 */
89 	245760,	/* 1D: 245.76 */
90 	327680,	/* 1E: 327.68 */
91 	491520	/* 1F: 491.52 */
92 };
93 
94 /*
95  * Validate a RWQE and fill in the SGE state.
96  * Return 1 if OK.
97  */
98 static int init_sge(struct rvt_qp *qp, struct rvt_rwqe *wqe)
99 {
100 	int i, j, ret;
101 	struct ib_wc wc;
102 	struct rvt_lkey_table *rkt;
103 	struct rvt_pd *pd;
104 	struct rvt_sge_state *ss;
105 
106 	rkt = &to_idev(qp->ibqp.device)->rdi.lkey_table;
107 	pd = ibpd_to_rvtpd(qp->ibqp.srq ? qp->ibqp.srq->pd : qp->ibqp.pd);
108 	ss = &qp->r_sge;
109 	ss->sg_list = qp->r_sg_list;
110 	qp->r_len = 0;
111 	for (i = j = 0; i < wqe->num_sge; i++) {
112 		if (wqe->sg_list[i].length == 0)
113 			continue;
114 		/* Check LKEY */
115 		if (!rvt_lkey_ok(rkt, pd, j ? &ss->sg_list[j - 1] : &ss->sge,
116 				 &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
117 			goto bad_lkey;
118 		qp->r_len += wqe->sg_list[i].length;
119 		j++;
120 	}
121 	ss->num_sge = j;
122 	ss->total_len = qp->r_len;
123 	ret = 1;
124 	goto bail;
125 
126 bad_lkey:
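	/* Drop the MR references taken above by rvt_lkey_ok(). */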
127 	while (j) {
128 		struct rvt_sge *sge = --j ? &ss->sg_list[j - 1] : &ss->sge;
129 
130 		rvt_put_mr(sge->mr);
131 	}
132 	ss->num_sge = 0;
133 	memset(&wc, 0, sizeof(wc));
134 	wc.wr_id = wqe->wr_id;
135 	wc.status = IB_WC_LOC_PROT_ERR;
136 	wc.opcode = IB_WC_RECV;
137 	wc.qp = &qp->ibqp;
138 	/* Signal solicited completion event. */
139 	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc, 1);
140 	ret = 0;
141 bail:
142 	return ret;
143 }
144 
145 /**
146  * hfi1_rvt_get_rwqe - copy the next RWQE into the QP's RWQE
147  * @qp: the QP
148  * @wr_id_only: update qp->r_wr_id only, not qp->r_sge
149  *
150  * Return -1 if there is a local error, 0 if no RWQE is available,
151  * otherwise return 1.
152  *
153  * Can be called from interrupt level.
154  */
155 int hfi1_rvt_get_rwqe(struct rvt_qp *qp, int wr_id_only)
156 {
157 	unsigned long flags;
158 	struct rvt_rq *rq;
159 	struct rvt_rwq *wq;
160 	struct rvt_srq *srq;
161 	struct rvt_rwqe *wqe;
162 	void (*handler)(struct ib_event *, void *);
163 	u32 tail;
164 	int ret;
165 
166 	if (qp->ibqp.srq) {
167 		srq = ibsrq_to_rvtsrq(qp->ibqp.srq);
168 		handler = srq->ibsrq.event_handler;
169 		rq = &srq->rq;
170 	} else {
171 		srq = NULL;
172 		handler = NULL;
173 		rq = &qp->r_rq;
174 	}
175 
176 	spin_lock_irqsave(&rq->lock, flags);
177 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK)) {
178 		ret = 0;
179 		goto unlock;
180 	}
181 
182 	wq = rq->wq;
183 	tail = wq->tail;
184 	/* Validate tail before using it since it is user writable. */
185 	if (tail >= rq->size)
186 		tail = 0;
187 	if (unlikely(tail == wq->head)) {
188 		ret = 0;
189 		goto unlock;
190 	}
191 	/* Make sure entry is read after head index is read. */
192 	smp_rmb();
193 	wqe = rvt_get_rwqe_ptr(rq, tail);
194 	/*
195 	 * Even though we update the tail index in memory, the verbs
196 	 * consumer is not supposed to post more entries until a
197 	 * completion is generated.
198 	 */
199 	if (++tail >= rq->size)
200 		tail = 0;
201 	wq->tail = tail;
202 	if (!wr_id_only && !init_sge(qp, wqe)) {
203 		ret = -1;
204 		goto unlock;
205 	}
206 	qp->r_wr_id = wqe->wr_id;
207 
208 	ret = 1;
209 	set_bit(RVT_R_WRID_VALID, &qp->r_aflags);
210 	if (handler) {
211 		u32 n;
212 
213 		/*
214 		 * Validate head pointer value and compute
215 		 * the number of remaining WQEs.
216 		 */
217 		n = wq->head;
218 		if (n >= rq->size)
219 			n = 0;
220 		if (n < tail)
221 			n += rq->size - tail;
222 		else
223 			n -= tail;
224 		if (n < srq->limit) {
225 			struct ib_event ev;
226 
227 			srq->limit = 0;
228 			spin_unlock_irqrestore(&rq->lock, flags);
229 			ev.device = qp->ibqp.device;
230 			ev.element.srq = qp->ibqp.srq;
231 			ev.event = IB_EVENT_SRQ_LIMIT_REACHED;
232 			handler(&ev, srq->ibsrq.srq_context);
233 			goto bail;
234 		}
235 	}
236 unlock:
237 	spin_unlock_irqrestore(&rq->lock, flags);
238 bail:
239 	return ret;
240 }
241 
242 static int gid_ok(union ib_gid *gid, __be64 gid_prefix, __be64 id)
243 {
244 	return (gid->global.interface_id == id &&
245 		(gid->global.subnet_prefix == gid_prefix ||
246 		 gid->global.subnet_prefix == IB_DEFAULT_GID_PREFIX));
247 }
248 
249 /*
250  * hfi1_ruc_check_hdr - validate a packet's header against the QP state
251  * This should be called with the QP r_lock held.
252  *
253  * The s_lock will be acquired around the hfi1_migrate_qp() call.
254  */
255 int hfi1_ruc_check_hdr(struct hfi1_ibport *ibp, struct ib_header *hdr,
256 		       int has_grh, struct rvt_qp *qp, u32 bth0)
257 {
258 	__be64 guid;
259 	unsigned long flags;
260 	u8 sc5 = ibp->sl_to_sc[qp->remote_ah_attr.sl];
261 
262 	if (qp->s_mig_state == IB_MIG_ARMED && (bth0 & IB_BTH_MIG_REQ)) {
263 		if (!has_grh) {
264 			if (qp->alt_ah_attr.ah_flags & IB_AH_GRH)
265 				goto err;
266 		} else {
267 			if (!(qp->alt_ah_attr.ah_flags & IB_AH_GRH))
268 				goto err;
269 			guid = get_sguid(ibp, qp->alt_ah_attr.grh.sgid_index);
270 			if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
271 				    guid))
272 				goto err;
273 			if (!gid_ok(
274 				&hdr->u.l.grh.sgid,
275 				qp->alt_ah_attr.grh.dgid.global.subnet_prefix,
276 				qp->alt_ah_attr.grh.dgid.global.interface_id))
277 				goto err;
278 		}
279 		if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
280 					    sc5, be16_to_cpu(hdr->lrh[3])))) {
281 			hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
282 				       (u16)bth0,
283 				       (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
284 				       0, qp->ibqp.qp_num,
285 				       be16_to_cpu(hdr->lrh[3]),
286 				       be16_to_cpu(hdr->lrh[1]));
287 			goto err;
288 		}
289 		/* Validate the SLID. See Ch. 9.6.1.5 and 17.2.8 */
290 		if (be16_to_cpu(hdr->lrh[3]) != qp->alt_ah_attr.dlid ||
291 		    ppd_from_ibp(ibp)->port != qp->alt_ah_attr.port_num)
292 			goto err;
293 		spin_lock_irqsave(&qp->s_lock, flags);
294 		hfi1_migrate_qp(qp);
295 		spin_unlock_irqrestore(&qp->s_lock, flags);
296 	} else {
297 		if (!has_grh) {
298 			if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
299 				goto err;
300 		} else {
301 			if (!(qp->remote_ah_attr.ah_flags & IB_AH_GRH))
302 				goto err;
303 			guid = get_sguid(ibp,
304 					 qp->remote_ah_attr.grh.sgid_index);
305 			if (!gid_ok(&hdr->u.l.grh.dgid, ibp->rvp.gid_prefix,
306 				    guid))
307 				goto err;
308 			if (!gid_ok(
309 			     &hdr->u.l.grh.sgid,
310 			     qp->remote_ah_attr.grh.dgid.global.subnet_prefix,
311 			     qp->remote_ah_attr.grh.dgid.global.interface_id))
312 				goto err;
313 		}
314 		if (unlikely(rcv_pkey_check(ppd_from_ibp(ibp), (u16)bth0,
315 					    sc5, be16_to_cpu(hdr->lrh[3])))) {
316 			hfi1_bad_pqkey(ibp, OPA_TRAP_BAD_P_KEY,
317 				       (u16)bth0,
318 				       (be16_to_cpu(hdr->lrh[0]) >> 4) & 0xF,
319 				       0, qp->ibqp.qp_num,
320 				       be16_to_cpu(hdr->lrh[3]),
321 				       be16_to_cpu(hdr->lrh[1]));
322 			goto err;
323 		}
324 		/* Validate the SLID. See Ch. 9.6.1.5 */
325 		if (be16_to_cpu(hdr->lrh[3]) != qp->remote_ah_attr.dlid ||
326 		    ppd_from_ibp(ibp)->port != qp->port_num)
327 			goto err;
328 		if (qp->s_mig_state == IB_MIG_REARM &&
329 		    !(bth0 & IB_BTH_MIG_REQ))
330 			qp->s_mig_state = IB_MIG_ARMED;
331 	}
332 
333 	return 0;
334 
335 err:
336 	return 1;
337 }
338 
339 /**
340  * ruc_loopback - handle UC and RC loopback requests
341  * @sqp: the sending QP
342  *
343  * This is called from hfi1_do_send() to
344  * forward a WQE addressed to the same HFI.
345  * Note that although we are single threaded due to the send engine, we still
346  * have to protect against post_send().  We don't have to worry about
347  * receive interrupts since this is a connected protocol and all packets
348  * will pass through here.
349  */
350 static void ruc_loopback(struct rvt_qp *sqp)
351 {
352 	struct hfi1_ibport *ibp = to_iport(sqp->ibqp.device, sqp->port_num);
353 	struct rvt_qp *qp;
354 	struct rvt_swqe *wqe;
355 	struct rvt_sge *sge;
356 	unsigned long flags;
357 	struct ib_wc wc;
358 	u64 sdata;
359 	atomic64_t *maddr;
360 	enum ib_wc_status send_status;
361 	int release;
362 	int ret;
363 	int copy_last = 0;
364 	u32 to;
365 	int local_ops = 0;
366 
367 	rcu_read_lock();
368 
369 	/*
370 	 * Note that we check the responder QP state after
371 	 * checking the requester's state.
372 	 */
373 	qp = rvt_lookup_qpn(ib_to_rvt(sqp->ibqp.device), &ibp->rvp,
374 			    sqp->remote_qpn);
375 
376 	spin_lock_irqsave(&sqp->s_lock, flags);
377 
378 	/* Return if we are already busy processing a work request. */
379 	if ((sqp->s_flags & (RVT_S_BUSY | RVT_S_ANY_WAIT)) ||
380 	    !(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_OR_FLUSH_SEND))
381 		goto unlock;
382 
383 	sqp->s_flags |= RVT_S_BUSY;
384 
385 again:
386 	smp_read_barrier_depends(); /* see post_one_send() */
387 	if (sqp->s_last == ACCESS_ONCE(sqp->s_head))
388 		goto clr_busy;
389 	wqe = rvt_get_swqe_ptr(sqp, sqp->s_last);
390 
391 	/* Return if it is not OK to start a new work request. */
392 	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_NEXT_SEND_OK)) {
393 		if (!(ib_rvt_state_ops[sqp->state] & RVT_FLUSH_SEND))
394 			goto clr_busy;
395 		/* We are in the error state, flush the work request. */
396 		send_status = IB_WC_WR_FLUSH_ERR;
397 		goto flush_send;
398 	}
399 
400 	/*
401 	 * We can rely on the entry not changing without the s_lock
402 	 * being held until we update s_last.
403 	 * We increment s_cur to indicate s_last is in progress.
404 	 */
405 	if (sqp->s_last == sqp->s_cur) {
406 		if (++sqp->s_cur >= sqp->s_size)
407 			sqp->s_cur = 0;
408 	}
409 	spin_unlock_irqrestore(&sqp->s_lock, flags);
410 
411 	if (!qp || !(ib_rvt_state_ops[qp->state] & RVT_PROCESS_RECV_OK) ||
412 	    qp->ibqp.qp_type != sqp->ibqp.qp_type) {
413 		ibp->rvp.n_pkt_drops++;
414 		/*
415 		 * For RC, the requester would timeout and retry so
416 		 * shortcut the timeouts and just signal too many retries.
417 		 */
418 		if (sqp->ibqp.qp_type == IB_QPT_RC)
419 			send_status = IB_WC_RETRY_EXC_ERR;
420 		else
421 			send_status = IB_WC_SUCCESS;
422 		goto serr;
423 	}
424 
425 	memset(&wc, 0, sizeof(wc));
426 	send_status = IB_WC_SUCCESS;
427 
428 	release = 1;
429 	sqp->s_sge.sge = wqe->sg_list[0];
430 	sqp->s_sge.sg_list = wqe->sg_list + 1;
431 	sqp->s_sge.num_sge = wqe->wr.num_sge;
432 	sqp->s_len = wqe->length;
433 	switch (wqe->wr.opcode) {
434 	case IB_WR_REG_MR:
435 		goto send_comp;
436 
437 	case IB_WR_LOCAL_INV:
438 		if (!(wqe->wr.send_flags & RVT_SEND_COMPLETION_ONLY)) {
439 			if (rvt_invalidate_rkey(sqp,
440 						wqe->wr.ex.invalidate_rkey))
441 				send_status = IB_WC_LOC_PROT_ERR;
442 			local_ops = 1;
443 		}
444 		goto send_comp;
445 
446 	case IB_WR_SEND_WITH_INV:
447 		if (!rvt_invalidate_rkey(qp, wqe->wr.ex.invalidate_rkey)) {
448 			wc.wc_flags = IB_WC_WITH_INVALIDATE;
449 			wc.ex.invalidate_rkey = wqe->wr.ex.invalidate_rkey;
450 		}
451 		goto send;
452 
453 	case IB_WR_SEND_WITH_IMM:
454 		wc.wc_flags = IB_WC_WITH_IMM;
455 		wc.ex.imm_data = wqe->wr.ex.imm_data;
456 		/* FALLTHROUGH */
457 	case IB_WR_SEND:
458 send:
459 		ret = hfi1_rvt_get_rwqe(qp, 0);
460 		if (ret < 0)
461 			goto op_err;
462 		if (!ret)
463 			goto rnr_nak;
464 		break;
465 
466 	case IB_WR_RDMA_WRITE_WITH_IMM:
467 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
468 			goto inv_err;
469 		wc.wc_flags = IB_WC_WITH_IMM;
470 		wc.ex.imm_data = wqe->wr.ex.imm_data;
471 		ret = hfi1_rvt_get_rwqe(qp, 1);
472 		if (ret < 0)
473 			goto op_err;
474 		if (!ret)
475 			goto rnr_nak;
476 		/* skip copy_last set and qp_access_flags recheck */
477 		goto do_write;
478 	case IB_WR_RDMA_WRITE:
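		/*
		 * For user PDs, have hfi1_copy_sge() write the tail of the
		 * payload last, so an application polling the end of the
		 * buffer does not see it before the rest of the data.
		 */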
479 		copy_last = ibpd_to_rvtpd(qp->ibqp.pd)->user;
480 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_WRITE)))
481 			goto inv_err;
482 do_write:
483 		if (wqe->length == 0)
484 			break;
485 		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, wqe->length,
486 					  wqe->rdma_wr.remote_addr,
487 					  wqe->rdma_wr.rkey,
488 					  IB_ACCESS_REMOTE_WRITE)))
489 			goto acc_err;
490 		qp->r_sge.sg_list = NULL;
491 		qp->r_sge.num_sge = 1;
492 		qp->r_sge.total_len = wqe->length;
493 		break;
494 
495 	case IB_WR_RDMA_READ:
496 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_READ)))
497 			goto inv_err;
498 		if (unlikely(!rvt_rkey_ok(qp, &sqp->s_sge.sge, wqe->length,
499 					  wqe->rdma_wr.remote_addr,
500 					  wqe->rdma_wr.rkey,
501 					  IB_ACCESS_REMOTE_READ)))
502 			goto acc_err;
503 		release = 0;
504 		sqp->s_sge.sg_list = NULL;
505 		sqp->s_sge.num_sge = 1;
506 		qp->r_sge.sge = wqe->sg_list[0];
507 		qp->r_sge.sg_list = wqe->sg_list + 1;
508 		qp->r_sge.num_sge = wqe->wr.num_sge;
509 		qp->r_sge.total_len = wqe->length;
510 		break;
511 
512 	case IB_WR_ATOMIC_CMP_AND_SWP:
513 	case IB_WR_ATOMIC_FETCH_AND_ADD:
514 		if (unlikely(!(qp->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC)))
515 			goto inv_err;
516 		if (unlikely(!rvt_rkey_ok(qp, &qp->r_sge.sge, sizeof(u64),
517 					  wqe->atomic_wr.remote_addr,
518 					  wqe->atomic_wr.rkey,
519 					  IB_ACCESS_REMOTE_ATOMIC)))
520 			goto acc_err;
521 		/* Perform atomic OP and save result. */
522 		maddr = (atomic64_t *)qp->r_sge.sge.vaddr;
523 		sdata = wqe->atomic_wr.compare_add;
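		/*
		 * atomic64_add_return() returns the post-add value, so
		 * subtract sdata to report the original value for
		 * fetch-and-add; cmpxchg() already returns the old value.
		 */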
524 		*(u64 *)sqp->s_sge.sge.vaddr =
525 			(wqe->wr.opcode == IB_WR_ATOMIC_FETCH_AND_ADD) ?
526 			(u64)atomic64_add_return(sdata, maddr) - sdata :
527 			(u64)cmpxchg((u64 *)qp->r_sge.sge.vaddr,
528 				      sdata, wqe->atomic_wr.swap);
529 		rvt_put_mr(qp->r_sge.sge.mr);
530 		qp->r_sge.num_sge = 0;
531 		goto send_comp;
532 
533 	default:
534 		send_status = IB_WC_LOC_QP_OP_ERR;
535 		goto serr;
536 	}
537 
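	/* Copy the payload from the sender's SGEs to the receiver's SGEs. */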
538 	sge = &sqp->s_sge.sge;
539 	while (sqp->s_len) {
540 		u32 len = sqp->s_len;
541 
542 		if (len > sge->length)
543 			len = sge->length;
544 		if (len > sge->sge_length)
545 			len = sge->sge_length;
546 		WARN_ON_ONCE(len == 0);
547 		hfi1_copy_sge(&qp->r_sge, sge->vaddr, len, release, copy_last);
548 		sge->vaddr += len;
549 		sge->length -= len;
550 		sge->sge_length -= len;
551 		if (sge->sge_length == 0) {
552 			if (!release)
553 				rvt_put_mr(sge->mr);
554 			if (--sqp->s_sge.num_sge)
555 				*sge = *sqp->s_sge.sg_list++;
556 		} else if (sge->length == 0 && sge->mr->lkey) {
557 			if (++sge->n >= RVT_SEGSZ) {
558 				if (++sge->m >= sge->mr->mapsz)
559 					break;
560 				sge->n = 0;
561 			}
562 			sge->vaddr =
563 				sge->mr->map[sge->m]->segs[sge->n].vaddr;
564 			sge->length =
565 				sge->mr->map[sge->m]->segs[sge->n].length;
566 		}
567 		sqp->s_len -= len;
568 	}
569 	if (release)
570 		rvt_put_ss(&qp->r_sge);
571 
572 	if (!test_and_clear_bit(RVT_R_WRID_VALID, &qp->r_aflags))
573 		goto send_comp;
574 
575 	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
576 		wc.opcode = IB_WC_RECV_RDMA_WITH_IMM;
577 	else
578 		wc.opcode = IB_WC_RECV;
579 	wc.wr_id = qp->r_wr_id;
580 	wc.status = IB_WC_SUCCESS;
581 	wc.byte_len = wqe->length;
582 	wc.qp = &qp->ibqp;
583 	wc.src_qp = qp->remote_qpn;
584 	wc.slid = qp->remote_ah_attr.dlid;
585 	wc.sl = qp->remote_ah_attr.sl;
586 	wc.port_num = 1;
587 	/* Signal completion event if the solicited bit is set. */
588 	rvt_cq_enter(ibcq_to_rvtcq(qp->ibqp.recv_cq), &wc,
589 		     wqe->wr.send_flags & IB_SEND_SOLICITED);
590 
591 send_comp:
592 	spin_lock_irqsave(&sqp->s_lock, flags);
593 	ibp->rvp.n_loop_pkts++;
594 flush_send:
595 	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
596 	hfi1_send_complete(sqp, wqe, send_status);
597 	if (local_ops) {
598 		atomic_dec(&sqp->local_ops_pending);
599 		local_ops = 0;
600 	}
601 	goto again;
602 
603 rnr_nak:
604 	/* Handle RNR NAK */
605 	if (qp->ibqp.qp_type == IB_QPT_UC)
606 		goto send_comp;
607 	ibp->rvp.n_rnr_naks++;
608 	/*
609 	 * Note: we don't need the s_lock held since the BUSY flag
610 	 * makes this single threaded.
611 	 */
612 	if (sqp->s_rnr_retry == 0) {
613 		send_status = IB_WC_RNR_RETRY_EXC_ERR;
614 		goto serr;
615 	}
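	/* An RNR retry count of 7 means retry forever; don't decrement. */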
616 	if (sqp->s_rnr_retry_cnt < 7)
617 		sqp->s_rnr_retry--;
618 	spin_lock_irqsave(&sqp->s_lock, flags);
619 	if (!(ib_rvt_state_ops[sqp->state] & RVT_PROCESS_RECV_OK))
620 		goto clr_busy;
621 	to = ib_hfi1_rnr_table[qp->r_min_rnr_timer];
622 	hfi1_add_rnr_timer(sqp, to);
623 	goto clr_busy;
624 
625 op_err:
626 	send_status = IB_WC_REM_OP_ERR;
627 	wc.status = IB_WC_LOC_QP_OP_ERR;
628 	goto err;
629 
630 inv_err:
631 	send_status = IB_WC_REM_INV_REQ_ERR;
632 	wc.status = IB_WC_LOC_QP_OP_ERR;
633 	goto err;
634 
635 acc_err:
636 	send_status = IB_WC_REM_ACCESS_ERR;
637 	wc.status = IB_WC_LOC_PROT_ERR;
638 err:
639 	/* responder goes to error state */
640 	hfi1_rc_error(qp, wc.status);
641 
642 serr:
643 	spin_lock_irqsave(&sqp->s_lock, flags);
644 	hfi1_send_complete(sqp, wqe, send_status);
645 	if (sqp->ibqp.qp_type == IB_QPT_RC) {
646 		int lastwqe = rvt_error_qp(sqp, IB_WC_WR_FLUSH_ERR);
647 
648 		sqp->s_flags &= ~RVT_S_BUSY;
649 		spin_unlock_irqrestore(&sqp->s_lock, flags);
650 		if (lastwqe) {
651 			struct ib_event ev;
652 
653 			ev.device = sqp->ibqp.device;
654 			ev.element.qp = &sqp->ibqp;
655 			ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
656 			sqp->ibqp.event_handler(&ev, sqp->ibqp.qp_context);
657 		}
658 		goto done;
659 	}
660 clr_busy:
661 	sqp->s_flags &= ~RVT_S_BUSY;
662 unlock:
663 	spin_unlock_irqrestore(&sqp->s_lock, flags);
664 done:
665 	rcu_read_unlock();
666 }
667 
668 /**
669  * hfi1_make_grh - construct a GRH header
670  * @ibp: a pointer to the IB port
671  * @hdr: a pointer to the GRH header being constructed
672  * @grh: the global route address to send to
673  * @hwords: the number of 32 bit words of header being sent
674  * @nwords: the number of 32 bit words of data being sent
675  *
676  * Return the size of the header in 32 bit words.
677  */
678 u32 hfi1_make_grh(struct hfi1_ibport *ibp, struct ib_grh *hdr,
679 		  struct ib_global_route *grh, u32 hwords, u32 nwords)
680 {
681 	hdr->version_tclass_flow =
682 		cpu_to_be32((IB_GRH_VERSION << IB_GRH_VERSION_SHIFT) |
683 			    (grh->traffic_class << IB_GRH_TCLASS_SHIFT) |
684 			    (grh->flow_label << IB_GRH_FLOW_SHIFT));
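	/* PayLen: bytes following the GRH (later headers, payload, CRC). */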
685 	hdr->paylen = cpu_to_be16((hwords - 2 + nwords + SIZE_OF_CRC) << 2);
686 	/* next_hdr is defined by C8-7 in ch. 8.4.1 */
687 	hdr->next_hdr = IB_GRH_NEXT_HDR;
688 	hdr->hop_limit = grh->hop_limit;
689 	/* The SGID is 32-bit aligned. */
690 	hdr->sgid.global.subnet_prefix = ibp->rvp.gid_prefix;
691 	hdr->sgid.global.interface_id =
692 		grh->sgid_index < HFI1_GUIDS_PER_PORT ?
693 		get_sguid(ibp, grh->sgid_index) :
694 		get_sguid(ibp, HFI1_PORT_GUID_INDEX);
695 	hdr->dgid = grh->dgid;
696 
697 	/* GRH header size in 32-bit words. */
698 	return sizeof(struct ib_grh) / sizeof(u32);
699 }
700 
701 #define BTH2_OFFSET (offsetof(struct hfi1_sdma_header, hdr.u.oth.bth[2]) / 4)
702 
703 /**
704  * build_ahg - create ahg in s_ahg
705  * @qp: a pointer to QP
706  * @npsn: the next PSN for the request/response
707  *
708  * This routine handles the AHG by allocating an AHG entry and causing the
709  * header of the first middle packet to be copied.
710  *
711  * Subsequent middle packets reuse the copied entry, editing only the
712  * PSN with one or two AHG edits.
713  */
714 static inline void build_ahg(struct rvt_qp *qp, u32 npsn)
715 {
716 	struct hfi1_qp_priv *priv = qp->priv;
717 
718 	if (unlikely(qp->s_flags & RVT_S_AHG_CLEAR))
719 		clear_ahg(qp);
720 	if (!(qp->s_flags & RVT_S_AHG_VALID)) {
721 		/* first middle that needs copy  */
722 		if (qp->s_ahgidx < 0)
723 			qp->s_ahgidx = sdma_ahg_alloc(priv->s_sde);
724 		if (qp->s_ahgidx >= 0) {
725 			qp->s_ahgpsn = npsn;
726 			priv->s_ahg->tx_flags |= SDMA_TXREQ_F_AHG_COPY;
727 			/* save a copy in case another thread changes it */
728 			priv->s_ahg->ahgidx = qp->s_ahgidx;
729 			qp->s_flags |= RVT_S_AHG_VALID;
730 		}
731 	} else {
732 		/* subsequent middle after valid */
733 		if (qp->s_ahgidx >= 0) {
734 			priv->s_ahg->tx_flags |= SDMA_TXREQ_F_USE_AHG;
735 			priv->s_ahg->ahgidx = qp->s_ahgidx;
736 			priv->s_ahg->ahgcount++;
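			/* First AHG edit: the low 16 bits of the PSN in BTH2. */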
737 			priv->s_ahg->ahgdesc[0] =
738 				sdma_build_ahg_descriptor(
739 					(__force u16)cpu_to_be16((u16)npsn),
740 					BTH2_OFFSET,
741 					16,
742 					16);
743 			if ((npsn & 0xffff0000) !=
744 					(qp->s_ahgpsn & 0xffff0000)) {
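				/* The upper PSN bits changed; edit them too. */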
745 				priv->s_ahg->ahgcount++;
746 				priv->s_ahg->ahgdesc[1] =
747 					sdma_build_ahg_descriptor(
748 						(__force u16)cpu_to_be16(
749 							(u16)(npsn >> 16)),
750 						BTH2_OFFSET,
751 						0,
752 						16);
753 			}
754 		}
755 	}
756 }
757 
758 void hfi1_make_ruc_header(struct rvt_qp *qp, struct ib_other_headers *ohdr,
759 			  u32 bth0, u32 bth2, int middle,
760 			  struct hfi1_pkt_state *ps)
761 {
762 	struct hfi1_qp_priv *priv = qp->priv;
763 	struct hfi1_ibport *ibp = ps->ibp;
764 	u16 lrh0;
765 	u32 nwords;
766 	u32 extra_bytes;
767 	u32 bth1;
768 
769 	/* Construct the header. */
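	/* Pad the payload to a 4-byte boundary; nwords is in 32-bit words. */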
770 	extra_bytes = -ps->s_txreq->s_cur_size & 3;
771 	nwords = (ps->s_txreq->s_cur_size + extra_bytes) >> 2;
772 	lrh0 = HFI1_LRH_BTH;
773 	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
774 		qp->s_hdrwords += hfi1_make_grh(ibp,
775 						&ps->s_txreq->phdr.hdr.u.l.grh,
776 						&qp->remote_ah_attr.grh,
777 						qp->s_hdrwords, nwords);
778 		lrh0 = HFI1_LRH_GRH;
779 		middle = 0;
780 	}
781 	lrh0 |= (priv->s_sc & 0xf) << 12 | (qp->remote_ah_attr.sl & 0xf) << 4;
782 	/*
783 	 * reset s_ahg/AHG fields
784 	 *
785 	 * This ensures that the ahgentry/ahgcount
786 	 * are at a non-AHG default to protect
787 	 * build_verbs_tx_desc() from using
788 	 * a stale ahgidx.
789 	 *
790 	 * build_ahg() will modify as appropriate
791 	 * to use the AHG feature.
792 	 */
793 	priv->s_ahg->tx_flags = 0;
794 	priv->s_ahg->ahgcount = 0;
795 	priv->s_ahg->ahgidx = 0;
796 	if (qp->s_mig_state == IB_MIG_MIGRATED)
797 		bth0 |= IB_BTH_MIG_REQ;
798 	else
799 		middle = 0;
800 	if (middle)
801 		build_ahg(qp, bth2);
802 	else
803 		qp->s_flags &= ~RVT_S_AHG_VALID;
804 	ps->s_txreq->phdr.hdr.lrh[0] = cpu_to_be16(lrh0);
805 	ps->s_txreq->phdr.hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
806 	ps->s_txreq->phdr.hdr.lrh[2] =
807 		cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
808 	ps->s_txreq->phdr.hdr.lrh[3] = cpu_to_be16(ppd_from_ibp(ibp)->lid |
809 				       qp->remote_ah_attr.src_path_bits);
810 	bth0 |= hfi1_get_pkey(ibp, qp->s_pkey_index);
811 	bth0 |= extra_bytes << 20;
812 	ohdr->bth[0] = cpu_to_be32(bth0);
813 	bth1 = qp->remote_qpn;
814 	if (qp->s_flags & RVT_S_ECN) {
815 		qp->s_flags &= ~RVT_S_ECN;
816 		/* we recently received a FECN, so return a BECN */
817 		bth1 |= (HFI1_BECN_MASK << HFI1_BECN_SHIFT);
818 	}
819 	ohdr->bth[1] = cpu_to_be32(bth1);
820 	ohdr->bth[2] = cpu_to_be32(bth2);
821 }
822 
823 /* when sending, force a reschedule every one of these periods */
824 #define SEND_RESCHED_TIMEOUT (5 * HZ)  /* 5s in jiffies */
825 
826 void _hfi1_do_send(struct work_struct *work)
827 {
828 	struct iowait *wait = container_of(work, struct iowait, iowork);
829 	struct rvt_qp *qp = iowait_to_qp(wait);
830 
831 	hfi1_do_send(qp);
832 }
833 
834 /**
835  * hfi1_do_send - perform a send on a QP
836  * @qp: a pointer to the QP
837  *
838  * Process entries in the send work queue until credit or queue is
839  * exhausted.  Only allow one CPU to send a packet per QP.
840  * Otherwise, two threads could send packets out of order.
841  */
842 void hfi1_do_send(struct rvt_qp *qp)
843 {
844 	struct hfi1_pkt_state ps;
845 	struct hfi1_qp_priv *priv = qp->priv;
846 	int (*make_req)(struct rvt_qp *qp, struct hfi1_pkt_state *ps);
847 	unsigned long timeout;
848 	unsigned long timeout_int;
849 	int cpu;
850 
851 	ps.dev = to_idev(qp->ibqp.device);
852 	ps.ibp = to_iport(qp->ibqp.device, qp->port_num);
853 	ps.ppd = ppd_from_ibp(ps.ibp);
854 
855 	switch (qp->ibqp.qp_type) {
856 	case IB_QPT_RC:
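		/*
		 * A DLID that matches our LID once the LMC bits are masked
		 * off is local to this HFI, so handle the request as
		 * loopback instead of sending it on the wire.
		 */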
857 		if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
858 								) - 1)) ==
859 				 ps.ppd->lid)) {
860 			ruc_loopback(qp);
861 			return;
862 		}
863 		make_req = hfi1_make_rc_req;
864 		timeout_int = (qp->timeout_jiffies);
865 		break;
866 	case IB_QPT_UC:
867 		if (!loopback && ((qp->remote_ah_attr.dlid & ~((1 << ps.ppd->lmc
868 								) - 1)) ==
869 				 ps.ppd->lid)) {
870 			ruc_loopback(qp);
871 			return;
872 		}
873 		make_req = hfi1_make_uc_req;
874 		timeout_int = SEND_RESCHED_TIMEOUT;
875 		break;
876 	default:
877 		make_req = hfi1_make_ud_req;
878 		timeout_int = SEND_RESCHED_TIMEOUT;
879 	}
880 
881 	spin_lock_irqsave(&qp->s_lock, ps.flags);
882 
883 	/* Return if we are already busy processing a work request. */
884 	if (!hfi1_send_ok(qp)) {
885 		spin_unlock_irqrestore(&qp->s_lock, ps.flags);
886 		return;
887 	}
888 
889 	qp->s_flags |= RVT_S_BUSY;
890 
891 	timeout = jiffies + (timeout_int) / 8;
892 	cpu = priv->s_sde ? priv->s_sde->cpu :
893 			cpumask_first(cpumask_of_node(ps.ppd->dd->node));
894 	/* ensure a pre-built packet is handled */
895 	ps.s_txreq = get_waiting_verbs_txreq(qp);
896 	do {
897 		/* Check for a constructed packet to be sent. */
898 		if (qp->s_hdrwords != 0) {
899 			spin_unlock_irqrestore(&qp->s_lock, ps.flags);
900 			/*
901 			 * If the packet cannot be sent now, return and
902 			 * the send engine will be woken up later.
903 			 */
904 			if (hfi1_verbs_send(qp, &ps))
905 				return;
906 			/* Record that s_ahg is empty. */
907 			qp->s_hdrwords = 0;
908 			/* allow other tasks to run */
909 			if (unlikely(time_after(jiffies, timeout))) {
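				/*
				 * If the send workqueue is congested,
				 * requeue this QP so other QPs can make
				 * progress, and drop RVT_S_BUSY.
				 */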
910 				if (workqueue_congested(cpu,
911 							ps.ppd->hfi1_wq)) {
912 					spin_lock_irqsave(
913 						&qp->s_lock,
914 						ps.flags);
915 					qp->s_flags &= ~RVT_S_BUSY;
916 					hfi1_schedule_send(qp);
917 					spin_unlock_irqrestore(
918 						&qp->s_lock,
919 						ps.flags);
920 					this_cpu_inc(
921 						*ps.ppd->dd->send_schedule);
922 					return;
923 				}
924 				if (!irqs_disabled()) {
925 					cond_resched();
926 					this_cpu_inc(
927 					   *ps.ppd->dd->send_schedule);
928 				}
929 				timeout = jiffies + (timeout_int) / 8;
930 			}
931 			spin_lock_irqsave(&qp->s_lock, ps.flags);
932 		}
933 	} while (make_req(qp, &ps));
934 
935 	spin_unlock_irqrestore(&qp->s_lock, ps.flags);
936 }
937 
938 /*
939  * This should be called with s_lock held.
940  */
941 void hfi1_send_complete(struct rvt_qp *qp, struct rvt_swqe *wqe,
942 			enum ib_wc_status status)
943 {
944 	u32 old_last, last;
945 
946 	if (!(ib_rvt_state_ops[qp->state] & RVT_PROCESS_OR_FLUSH_SEND))
947 		return;
948 
949 	last = qp->s_last;
950 	old_last = last;
951 	if (++last >= qp->s_size)
952 		last = 0;
953 	qp->s_last = last;
954 	/* See post_send() */
955 	barrier();
956 	rvt_put_swqe(wqe);
957 	if (qp->ibqp.qp_type == IB_QPT_UD ||
958 	    qp->ibqp.qp_type == IB_QPT_SMI ||
959 	    qp->ibqp.qp_type == IB_QPT_GSI)
960 		atomic_dec(&ibah_to_rvtah(wqe->ud_wr.ah)->refcount);
961 
962 	rvt_qp_swqe_complete(qp, wqe, status);
963 
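	/* Advance the other ring indices past the completed entry if needed. */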
964 	if (qp->s_acked == old_last)
965 		qp->s_acked = last;
966 	if (qp->s_cur == old_last)
967 		qp->s_cur = last;
968 	if (qp->s_tail == old_last)
969 		qp->s_tail = last;
970 	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
971 		qp->s_draining = 0;
972 }
973