xref: /linux/drivers/infiniband/hw/hfi1/user_sdma.c (revision 0883c2c06fb5bcf5b9e008270827e63c09a88c1e)
1 /*
2  * Copyright(c) 2015, 2016 Intel Corporation.
3  *
4  * This file is provided under a dual BSD/GPLv2 license.  When using or
5  * redistributing this file, you may do so under either license.
6  *
7  * GPL LICENSE SUMMARY
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of version 2 of the GNU General Public License as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful, but
14  * WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License for more details.
17  *
18  * BSD LICENSE
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  *
24  *  - Redistributions of source code must retain the above copyright
25  *    notice, this list of conditions and the following disclaimer.
26  *  - Redistributions in binary form must reproduce the above copyright
27  *    notice, this list of conditions and the following disclaimer in
28  *    the documentation and/or other materials provided with the
29  *    distribution.
30  *  - Neither the name of Intel Corporation nor the names of its
31  *    contributors may be used to endorse or promote products derived
32  *    from this software without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45  *
46  */
47 #include <linux/mm.h>
48 #include <linux/types.h>
49 #include <linux/device.h>
50 #include <linux/dmapool.h>
51 #include <linux/slab.h>
52 #include <linux/list.h>
53 #include <linux/highmem.h>
54 #include <linux/io.h>
55 #include <linux/uio.h>
56 #include <linux/rbtree.h>
57 #include <linux/spinlock.h>
58 #include <linux/delay.h>
59 #include <linux/kthread.h>
60 #include <linux/mmu_context.h>
61 #include <linux/module.h>
62 #include <linux/vmalloc.h>
63 
64 #include "hfi.h"
65 #include "sdma.h"
66 #include "user_sdma.h"
67 #include "verbs.h"  /* for the headers */
68 #include "common.h" /* for struct hfi1_tid_info */
69 #include "trace.h"
70 #include "mmu_rb.h"
71 
72 static uint hfi1_sdma_comp_ring_size = 128;
73 module_param_named(sdma_comp_size, hfi1_sdma_comp_ring_size, uint, S_IRUGO);
74 MODULE_PARM_DESC(sdma_comp_size, "Size of User SDMA completion ring. Default: 128");
75 
/* Upper bound on the number of data io vectors in one message/request */
#define MAX_VECTORS_PER_REQ	8

/*
 * Upper bound on the number of packets sent from one message/request
 * before moving on to the next one.
 */
#define MAX_PKTS_PER_QUEUE	16

/*
 * One more than the page index of byte (x - 1), i.e. the page count
 * needed for x bytes when x >= 1 (assuming the usual PAGE_MASK and
 * PAGE_SHIFT definitions).
 */
#define num_pages(x) (1 + ((((x) - 1) & PAGE_MASK) >> PAGE_SHIFT))
85 
/*
 * Field extractors for the request control word (sdma_req_info.ctrl).
 * Each field is shifted down by its own *_SHIFT and then masked with its
 * own *_MASK. req_version() previously borrowed the OPCODE mask, which
 * would silently break if the two masks ever differed.
 */
#define req_opcode(x) \
	(((x) >> HFI1_SDMA_REQ_OPCODE_SHIFT) & HFI1_SDMA_REQ_OPCODE_MASK)
#define req_version(x) \
	(((x) >> HFI1_SDMA_REQ_VERSION_SHIFT) & HFI1_SDMA_REQ_VERSION_MASK)
#define req_iovcnt(x) \
	(((x) >> HFI1_SDMA_REQ_IOVCNT_SHIFT) & HFI1_SDMA_REQ_IOVCNT_MASK)
92 
/* Mask of the BTH.PSN bits used as the sequence number in expected rcvs */
#define BTH_SEQ_MASK 0x7ffull

/*
 * Shift/mask pairs describing the KDETH header fields. They are consumed
 * by KDETH_GET()/KDETH_SET() below when the header template is updated.
 */
#define KDETH_OFFSET_SHIFT        0
#define KDETH_OFFSET_MASK         0x7fff
#define KDETH_OM_SHIFT            15
#define KDETH_OM_MASK             0x1
#define KDETH_TID_SHIFT           16
#define KDETH_TID_MASK            0x3ff
#define KDETH_TIDCTRL_SHIFT       26
#define KDETH_TIDCTRL_MASK        0x3
#define KDETH_INTR_SHIFT          28
#define KDETH_INTR_MASK           0x1
#define KDETH_SH_SHIFT            29
#define KDETH_SH_MASK             0x1
#define KDETH_HCRC_UPPER_SHIFT    16
#define KDETH_HCRC_UPPER_MASK     0xff
#define KDETH_HCRC_LOWER_SHIFT    24
#define KDETH_HCRC_LOWER_MASK     0xff

/*
 * Convert between the 12-bit PBC length (counted in 4-byte words) and
 * the LRH byte length; the PBC value is one word larger.
 */
#define PBC2LRH(x) ((((x) & 0xfff) << 2) - 4)
#define LRH2PBC(x) ((((x) >> 2) + 1) & 0xfff)

/* Extract KDETH field @field from the little-endian dword @val */
#define KDETH_GET(val, field)						\
	(((le32_to_cpu((val))) >> KDETH_##field##_SHIFT) & KDETH_##field##_MASK)

/* Replace KDETH field @field inside the little-endian dword @dw by @val */
#define KDETH_SET(dw, field, val) do {					\
		u32 _kdeth_dw = le32_to_cpu(dw);			\
		_kdeth_dw &= ~(KDETH_##field##_MASK << KDETH_##field##_SHIFT); \
		_kdeth_dw |= (((val) & KDETH_##field##_MASK) <<	\
			      KDETH_##field##_SHIFT);			\
		dw = cpu_to_le32(_kdeth_dw);				\
	} while (0)

/*
 * Append one AHG descriptor to @arr and advance @idx.
 * NOTE: when @arr is already full this macro returns -ERANGE from the
 * *calling* function.
 */
#define AHG_HEADER_SET(arr, idx, dw, bit, width, value)			\
	do {								\
		if ((idx) < ARRAY_SIZE((arr)))				\
			(arr)[(idx++)] = sdma_build_ahg_descriptor(	\
				(__force u16)(value), (dw), (bit),	\
							(width));	\
		else							\
			return -ERANGE;					\
	} while (0)

/* KDETH OM multipliers and the size at which we switch between them */
#define KDETH_OM_SMALL     4
#define KDETH_OM_LARGE     64
#define KDETH_OM_MAX_SIZE  (1 << ((KDETH_OM_LARGE / KDETH_OM_SMALL) + 1))

/* user_sdma_txreq.flags: this txreq is the last packet of its request */
#define TXREQ_FLAGS_REQ_LAST_PKT BIT(0)

/* Bit numbers within user_sdma_request.flags */
#define SDMA_REQ_IN_USE     0
#define SDMA_REQ_FOR_THREAD 1
#define SDMA_REQ_SEND_DONE  2
#define SDMA_REQ_HAVE_AHG   3
#define SDMA_REQ_HAS_ERROR  4
#define SDMA_REQ_DONE_ERROR 5

/* Values stored in hfi1_user_sdma_pkt_q.state */
#define SDMA_PKT_Q_INACTIVE BIT(0)
#define SDMA_PKT_Q_ACTIVE   BIT(1)
#define SDMA_PKT_Q_DEFERRED BIT(2)

/*
 * How many times submission of a TX request is retried before the
 * process is put to sleep.
 */
#define MAX_DEFER_RETRY_COUNT 1

/* Packets submitted per user_sdma_send_pkts() call from the request path */
static unsigned initial_pkt_count = 8;

#define SDMA_IOWAIT_TIMEOUT 1000 /* in milliseconds */
168 
169 struct sdma_mmu_node;
170 
171 struct user_sdma_iovec {
172 	struct list_head list;
173 	struct iovec iov;
174 	/* number of pages in this vector */
175 	unsigned npages;
176 	/* array of pinned pages for this vector */
177 	struct page **pages;
178 	/*
179 	 * offset into the virtual address space of the vector at
180 	 * which we last left off.
181 	 */
182 	u64 offset;
183 	struct sdma_mmu_node *node;
184 };
185 
186 #define SDMA_CACHE_NODE_EVICT 0
187 
188 struct sdma_mmu_node {
189 	struct mmu_rb_node rb;
190 	struct list_head list;
191 	struct hfi1_user_sdma_pkt_q *pq;
192 	atomic_t refcount;
193 	struct page **pages;
194 	unsigned npages;
195 	unsigned long flags;
196 };
197 
198 struct user_sdma_request {
199 	struct sdma_req_info info;
200 	struct hfi1_user_sdma_pkt_q *pq;
201 	struct hfi1_user_sdma_comp_q *cq;
202 	/* This is the original header from user space */
203 	struct hfi1_pkt_header hdr;
204 	/*
205 	 * Pointer to the SDMA engine for this request.
206 	 * Since different request could be on different VLs,
207 	 * each request will need it's own engine pointer.
208 	 */
209 	struct sdma_engine *sde;
210 	u8 ahg_idx;
211 	u32 ahg[9];
212 	/*
213 	 * KDETH.Offset (Eager) field
214 	 * We need to remember the initial value so the headers
215 	 * can be updated properly.
216 	 */
217 	u32 koffset;
218 	/*
219 	 * KDETH.OFFSET (TID) field
220 	 * The offset can cover multiple packets, depending on the
221 	 * size of the TID entry.
222 	 */
223 	u32 tidoffset;
224 	/*
225 	 * KDETH.OM
226 	 * Remember this because the header template always sets it
227 	 * to 0.
228 	 */
229 	u8 omfactor;
230 	/*
231 	 * We copy the iovs for this request (based on
232 	 * info.iovcnt). These are only the data vectors
233 	 */
234 	unsigned data_iovs;
235 	/* total length of the data in the request */
236 	u32 data_len;
237 	/* progress index moving along the iovs array */
238 	unsigned iov_idx;
239 	struct user_sdma_iovec iovs[MAX_VECTORS_PER_REQ];
240 	/* number of elements copied to the tids array */
241 	u16 n_tids;
242 	/* TID array values copied from the tid_iov vector */
243 	u32 *tids;
244 	u16 tididx;
245 	u32 sent;
246 	u64 seqnum;
247 	u64 seqcomp;
248 	u64 seqsubmitted;
249 	struct list_head txps;
250 	unsigned long flags;
251 	/* status of the last txreq completed */
252 	int status;
253 };
254 
255 /*
256  * A single txreq could span up to 3 physical pages when the MTU
257  * is sufficiently large (> 4K). Each of the IOV pointers also
258  * needs it's own set of flags so the vector has been handled
259  * independently of each other.
260  */
261 struct user_sdma_txreq {
262 	/* Packet header for the txreq */
263 	struct hfi1_pkt_header hdr;
264 	struct sdma_txreq txreq;
265 	struct list_head list;
266 	struct user_sdma_request *req;
267 	u16 flags;
268 	unsigned busycount;
269 	u64 seqnum;
270 };
271 
/* Debug trace tagged with [unit:ctxt:subctxt:comp_idx] of a request */
#define SDMA_DBG(req, fmt, ...)						\
	hfi1_cdbg(SDMA, "[%u:%u:%u:%u] " fmt,				\
		  (req)->pq->dd->unit, (req)->pq->ctxt,			\
		  (req)->pq->subctxt, (req)->info.comp_idx,		\
		  ##__VA_ARGS__)

/* Debug trace tagged with [unit:ctxt:subctxt] of a packet queue */
#define SDMA_Q_DBG(pq, fmt, ...)					\
	hfi1_cdbg(SDMA, "[%u:%u:%u] " fmt,				\
		  (pq)->dd->unit, (pq)->ctxt, (pq)->subctxt,		\
		  ##__VA_ARGS__)
279 
280 static int user_sdma_send_pkts(struct user_sdma_request *, unsigned);
281 static int num_user_pages(const struct iovec *);
282 static void user_sdma_txreq_cb(struct sdma_txreq *, int);
283 static inline void pq_update(struct hfi1_user_sdma_pkt_q *);
284 static void user_sdma_free_request(struct user_sdma_request *, bool);
285 static int pin_vector_pages(struct user_sdma_request *,
286 			    struct user_sdma_iovec *);
287 static void unpin_vector_pages(struct mm_struct *, struct page **, unsigned,
288 			       unsigned);
289 static int check_header_template(struct user_sdma_request *,
290 				 struct hfi1_pkt_header *, u32, u32);
291 static int set_txreq_header(struct user_sdma_request *,
292 			    struct user_sdma_txreq *, u32);
293 static int set_txreq_header_ahg(struct user_sdma_request *,
294 				struct user_sdma_txreq *, u32);
295 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *,
296 				  struct hfi1_user_sdma_comp_q *,
297 				  u16, enum hfi1_sdma_comp_state, int);
298 static inline u32 set_pkt_bth_psn(__be32, u8, u32);
299 static inline u32 get_lrh_len(struct hfi1_pkt_header, u32 len);
300 
301 static int defer_packet_queue(
302 	struct sdma_engine *,
303 	struct iowait *,
304 	struct sdma_txreq *,
305 	unsigned seq);
306 static void activate_packet_queue(struct iowait *, int);
307 static bool sdma_rb_filter(struct mmu_rb_node *, unsigned long, unsigned long);
308 static int sdma_rb_insert(struct rb_root *, struct mmu_rb_node *);
309 static void sdma_rb_remove(struct rb_root *, struct mmu_rb_node *,
310 			   struct mm_struct *);
311 static int sdma_rb_invalidate(struct rb_root *, struct mmu_rb_node *);
312 
313 static struct mmu_rb_ops sdma_rb_ops = {
314 	.filter = sdma_rb_filter,
315 	.insert = sdma_rb_insert,
316 	.remove = sdma_rb_remove,
317 	.invalidate = sdma_rb_invalidate
318 };
319 
320 static int defer_packet_queue(
321 	struct sdma_engine *sde,
322 	struct iowait *wait,
323 	struct sdma_txreq *txreq,
324 	unsigned seq)
325 {
326 	struct hfi1_user_sdma_pkt_q *pq =
327 		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
328 	struct hfi1_ibdev *dev = &pq->dd->verbs_dev;
329 	struct user_sdma_txreq *tx =
330 		container_of(txreq, struct user_sdma_txreq, txreq);
331 
332 	if (sdma_progress(sde, seq, txreq)) {
333 		if (tx->busycount++ < MAX_DEFER_RETRY_COUNT)
334 			goto eagain;
335 	}
336 	/*
337 	 * We are assuming that if the list is enqueued somewhere, it
338 	 * is to the dmawait list since that is the only place where
339 	 * it is supposed to be enqueued.
340 	 */
341 	xchg(&pq->state, SDMA_PKT_Q_DEFERRED);
342 	write_seqlock(&dev->iowait_lock);
343 	if (list_empty(&pq->busy.list))
344 		list_add_tail(&pq->busy.list, &sde->dmawait);
345 	write_sequnlock(&dev->iowait_lock);
346 	return -EBUSY;
347 eagain:
348 	return -EAGAIN;
349 }
350 
351 static void activate_packet_queue(struct iowait *wait, int reason)
352 {
353 	struct hfi1_user_sdma_pkt_q *pq =
354 		container_of(wait, struct hfi1_user_sdma_pkt_q, busy);
355 	xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
356 	wake_up(&wait->wait_dma);
357 };
358 
359 static void sdma_kmem_cache_ctor(void *obj)
360 {
361 	struct user_sdma_txreq *tx = obj;
362 
363 	memset(tx, 0, sizeof(*tx));
364 }
365 
366 int hfi1_user_sdma_alloc_queues(struct hfi1_ctxtdata *uctxt, struct file *fp)
367 {
368 	struct hfi1_filedata *fd;
369 	int ret = 0;
370 	unsigned memsize;
371 	char buf[64];
372 	struct hfi1_devdata *dd;
373 	struct hfi1_user_sdma_comp_q *cq;
374 	struct hfi1_user_sdma_pkt_q *pq;
375 	unsigned long flags;
376 
377 	if (!uctxt || !fp) {
378 		ret = -EBADF;
379 		goto done;
380 	}
381 
382 	fd = fp->private_data;
383 
384 	if (!hfi1_sdma_comp_ring_size) {
385 		ret = -EINVAL;
386 		goto done;
387 	}
388 
389 	dd = uctxt->dd;
390 
391 	pq = kzalloc(sizeof(*pq), GFP_KERNEL);
392 	if (!pq)
393 		goto pq_nomem;
394 
395 	memsize = sizeof(*pq->reqs) * hfi1_sdma_comp_ring_size;
396 	pq->reqs = kzalloc(memsize, GFP_KERNEL);
397 	if (!pq->reqs)
398 		goto pq_reqs_nomem;
399 
400 	INIT_LIST_HEAD(&pq->list);
401 	pq->dd = dd;
402 	pq->ctxt = uctxt->ctxt;
403 	pq->subctxt = fd->subctxt;
404 	pq->n_max_reqs = hfi1_sdma_comp_ring_size;
405 	pq->state = SDMA_PKT_Q_INACTIVE;
406 	atomic_set(&pq->n_reqs, 0);
407 	init_waitqueue_head(&pq->wait);
408 	pq->sdma_rb_root = RB_ROOT;
409 	INIT_LIST_HEAD(&pq->evict);
410 	spin_lock_init(&pq->evict_lock);
411 
412 	iowait_init(&pq->busy, 0, NULL, defer_packet_queue,
413 		    activate_packet_queue, NULL);
414 	pq->reqidx = 0;
415 	snprintf(buf, 64, "txreq-kmem-cache-%u-%u-%u", dd->unit, uctxt->ctxt,
416 		 fd->subctxt);
417 	pq->txreq_cache = kmem_cache_create(buf,
418 			       sizeof(struct user_sdma_txreq),
419 					    L1_CACHE_BYTES,
420 					    SLAB_HWCACHE_ALIGN,
421 					    sdma_kmem_cache_ctor);
422 	if (!pq->txreq_cache) {
423 		dd_dev_err(dd, "[%u] Failed to allocate TxReq cache\n",
424 			   uctxt->ctxt);
425 		goto pq_txreq_nomem;
426 	}
427 	fd->pq = pq;
428 	cq = kzalloc(sizeof(*cq), GFP_KERNEL);
429 	if (!cq)
430 		goto cq_nomem;
431 
432 	memsize = PAGE_ALIGN(sizeof(*cq->comps) * hfi1_sdma_comp_ring_size);
433 	cq->comps = vmalloc_user(memsize);
434 	if (!cq->comps)
435 		goto cq_comps_nomem;
436 
437 	cq->nentries = hfi1_sdma_comp_ring_size;
438 	fd->cq = cq;
439 
440 	ret = hfi1_mmu_rb_register(&pq->sdma_rb_root, &sdma_rb_ops);
441 	if (ret) {
442 		dd_dev_err(dd, "Failed to register with MMU %d", ret);
443 		goto done;
444 	}
445 
446 	spin_lock_irqsave(&uctxt->sdma_qlock, flags);
447 	list_add(&pq->list, &uctxt->sdma_queues);
448 	spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
449 	goto done;
450 
451 cq_comps_nomem:
452 	kfree(cq);
453 cq_nomem:
454 	kmem_cache_destroy(pq->txreq_cache);
455 pq_txreq_nomem:
456 	kfree(pq->reqs);
457 pq_reqs_nomem:
458 	kfree(pq);
459 	fd->pq = NULL;
460 pq_nomem:
461 	ret = -ENOMEM;
462 done:
463 	return ret;
464 }
465 
466 int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd)
467 {
468 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
469 	struct hfi1_user_sdma_pkt_q *pq;
470 	unsigned long flags;
471 
472 	hfi1_cdbg(SDMA, "[%u:%u:%u] Freeing user SDMA queues", uctxt->dd->unit,
473 		  uctxt->ctxt, fd->subctxt);
474 	pq = fd->pq;
475 	hfi1_mmu_rb_unregister(&pq->sdma_rb_root);
476 	if (pq) {
477 		spin_lock_irqsave(&uctxt->sdma_qlock, flags);
478 		if (!list_empty(&pq->list))
479 			list_del_init(&pq->list);
480 		spin_unlock_irqrestore(&uctxt->sdma_qlock, flags);
481 		iowait_sdma_drain(&pq->busy);
482 		/* Wait until all requests have been freed. */
483 		wait_event_interruptible(
484 			pq->wait,
485 			(ACCESS_ONCE(pq->state) == SDMA_PKT_Q_INACTIVE));
486 		kfree(pq->reqs);
487 		kmem_cache_destroy(pq->txreq_cache);
488 		kfree(pq);
489 		fd->pq = NULL;
490 	}
491 	if (fd->cq) {
492 		vfree(fd->cq->comps);
493 		kfree(fd->cq);
494 		fd->cq = NULL;
495 	}
496 	return 0;
497 }
498 
499 int hfi1_user_sdma_process_request(struct file *fp, struct iovec *iovec,
500 				   unsigned long dim, unsigned long *count)
501 {
502 	int ret = 0, i = 0;
503 	struct hfi1_filedata *fd = fp->private_data;
504 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
505 	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
506 	struct hfi1_user_sdma_comp_q *cq = fd->cq;
507 	struct hfi1_devdata *dd = pq->dd;
508 	unsigned long idx = 0;
509 	u8 pcount = initial_pkt_count;
510 	struct sdma_req_info info;
511 	struct user_sdma_request *req;
512 	u8 opcode, sc, vl;
513 	int req_queued = 0;
514 
515 	if (iovec[idx].iov_len < sizeof(info) + sizeof(req->hdr)) {
516 		hfi1_cdbg(
517 		   SDMA,
518 		   "[%u:%u:%u] First vector not big enough for header %lu/%lu",
519 		   dd->unit, uctxt->ctxt, fd->subctxt,
520 		   iovec[idx].iov_len, sizeof(info) + sizeof(req->hdr));
521 		return -EINVAL;
522 	}
523 	ret = copy_from_user(&info, iovec[idx].iov_base, sizeof(info));
524 	if (ret) {
525 		hfi1_cdbg(SDMA, "[%u:%u:%u] Failed to copy info QW (%d)",
526 			  dd->unit, uctxt->ctxt, fd->subctxt, ret);
527 		return -EFAULT;
528 	}
529 
530 	trace_hfi1_sdma_user_reqinfo(dd, uctxt->ctxt, fd->subctxt,
531 				     (u16 *)&info);
532 	if (cq->comps[info.comp_idx].status == QUEUED ||
533 	    test_bit(SDMA_REQ_IN_USE, &pq->reqs[info.comp_idx].flags)) {
534 		hfi1_cdbg(SDMA, "[%u:%u:%u] Entry %u is in QUEUED state",
535 			  dd->unit, uctxt->ctxt, fd->subctxt,
536 			  info.comp_idx);
537 		return -EBADSLT;
538 	}
539 	if (!info.fragsize) {
540 		hfi1_cdbg(SDMA,
541 			  "[%u:%u:%u:%u] Request does not specify fragsize",
542 			  dd->unit, uctxt->ctxt, fd->subctxt, info.comp_idx);
543 		return -EINVAL;
544 	}
545 	/*
546 	 * We've done all the safety checks that we can up to this point,
547 	 * "allocate" the request entry.
548 	 */
549 	hfi1_cdbg(SDMA, "[%u:%u:%u] Using req/comp entry %u\n", dd->unit,
550 		  uctxt->ctxt, fd->subctxt, info.comp_idx);
551 	req = pq->reqs + info.comp_idx;
552 	memset(req, 0, sizeof(*req));
553 	/* Mark the request as IN_USE before we start filling it in. */
554 	set_bit(SDMA_REQ_IN_USE, &req->flags);
555 	req->data_iovs = req_iovcnt(info.ctrl) - 1;
556 	req->pq = pq;
557 	req->cq = cq;
558 	req->status = -1;
559 	INIT_LIST_HEAD(&req->txps);
560 
561 	memcpy(&req->info, &info, sizeof(info));
562 
563 	if (req_opcode(info.ctrl) == EXPECTED)
564 		req->data_iovs--;
565 
566 	if (!info.npkts || req->data_iovs > MAX_VECTORS_PER_REQ) {
567 		SDMA_DBG(req, "Too many vectors (%u/%u)", req->data_iovs,
568 			 MAX_VECTORS_PER_REQ);
569 		return -EINVAL;
570 	}
571 	/* Copy the header from the user buffer */
572 	ret = copy_from_user(&req->hdr, iovec[idx].iov_base + sizeof(info),
573 			     sizeof(req->hdr));
574 	if (ret) {
575 		SDMA_DBG(req, "Failed to copy header template (%d)", ret);
576 		ret = -EFAULT;
577 		goto free_req;
578 	}
579 
580 	/* If Static rate control is not enabled, sanitize the header. */
581 	if (!HFI1_CAP_IS_USET(STATIC_RATE_CTRL))
582 		req->hdr.pbc[2] = 0;
583 
584 	/* Validate the opcode. Do not trust packets from user space blindly. */
585 	opcode = (be32_to_cpu(req->hdr.bth[0]) >> 24) & 0xff;
586 	if ((opcode & USER_OPCODE_CHECK_MASK) !=
587 	     USER_OPCODE_CHECK_VAL) {
588 		SDMA_DBG(req, "Invalid opcode (%d)", opcode);
589 		ret = -EINVAL;
590 		goto free_req;
591 	}
592 	/*
593 	 * Validate the vl. Do not trust packets from user space blindly.
594 	 * VL comes from PBC, SC comes from LRH, and the VL needs to
595 	 * match the SC look up.
596 	 */
597 	vl = (le16_to_cpu(req->hdr.pbc[0]) >> 12) & 0xF;
598 	sc = (((be16_to_cpu(req->hdr.lrh[0]) >> 12) & 0xF) |
599 	      (((le16_to_cpu(req->hdr.pbc[1]) >> 14) & 0x1) << 4));
600 	if (vl >= dd->pport->vls_operational ||
601 	    vl != sc_to_vlt(dd, sc)) {
602 		SDMA_DBG(req, "Invalid SC(%u)/VL(%u)", sc, vl);
603 		ret = -EINVAL;
604 		goto free_req;
605 	}
606 
607 	/* Checking P_KEY for requests from user-space */
608 	if (egress_pkey_check(dd->pport, req->hdr.lrh, req->hdr.bth, sc,
609 			      PKEY_CHECK_INVALID)) {
610 		ret = -EINVAL;
611 		goto free_req;
612 	}
613 
614 	/*
615 	 * Also should check the BTH.lnh. If it says the next header is GRH then
616 	 * the RXE parsing will be off and will land in the middle of the KDETH
617 	 * or miss it entirely.
618 	 */
619 	if ((be16_to_cpu(req->hdr.lrh[0]) & 0x3) == HFI1_LRH_GRH) {
620 		SDMA_DBG(req, "User tried to pass in a GRH");
621 		ret = -EINVAL;
622 		goto free_req;
623 	}
624 
625 	req->koffset = le32_to_cpu(req->hdr.kdeth.swdata[6]);
626 	/*
627 	 * Calculate the initial TID offset based on the values of
628 	 * KDETH.OFFSET and KDETH.OM that are passed in.
629 	 */
630 	req->tidoffset = KDETH_GET(req->hdr.kdeth.ver_tid_offset, OFFSET) *
631 		(KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
632 		 KDETH_OM_LARGE : KDETH_OM_SMALL);
633 	SDMA_DBG(req, "Initial TID offset %u", req->tidoffset);
634 	idx++;
635 
636 	/* Save all the IO vector structures */
637 	while (i < req->data_iovs) {
638 		INIT_LIST_HEAD(&req->iovs[i].list);
639 		memcpy(&req->iovs[i].iov, iovec + idx++, sizeof(struct iovec));
640 		ret = pin_vector_pages(req, &req->iovs[i]);
641 		if (ret) {
642 			req->status = ret;
643 			goto free_req;
644 		}
645 		req->data_len += req->iovs[i++].iov.iov_len;
646 	}
647 	SDMA_DBG(req, "total data length %u", req->data_len);
648 
649 	if (pcount > req->info.npkts)
650 		pcount = req->info.npkts;
651 	/*
652 	 * Copy any TID info
653 	 * User space will provide the TID info only when the
654 	 * request type is EXPECTED. This is true even if there is
655 	 * only one packet in the request and the header is already
656 	 * setup. The reason for the singular TID case is that the
657 	 * driver needs to perform safety checks.
658 	 */
659 	if (req_opcode(req->info.ctrl) == EXPECTED) {
660 		u16 ntids = iovec[idx].iov_len / sizeof(*req->tids);
661 
662 		if (!ntids || ntids > MAX_TID_PAIR_ENTRIES) {
663 			ret = -EINVAL;
664 			goto free_req;
665 		}
666 		req->tids = kcalloc(ntids, sizeof(*req->tids), GFP_KERNEL);
667 		if (!req->tids) {
668 			ret = -ENOMEM;
669 			goto free_req;
670 		}
671 		/*
672 		 * We have to copy all of the tids because they may vary
673 		 * in size and, therefore, the TID count might not be
674 		 * equal to the pkt count. However, there is no way to
675 		 * tell at this point.
676 		 */
677 		ret = copy_from_user(req->tids, iovec[idx].iov_base,
678 				     ntids * sizeof(*req->tids));
679 		if (ret) {
680 			SDMA_DBG(req, "Failed to copy %d TIDs (%d)",
681 				 ntids, ret);
682 			ret = -EFAULT;
683 			goto free_req;
684 		}
685 		req->n_tids = ntids;
686 		idx++;
687 	}
688 
689 	/* Have to select the engine */
690 	req->sde = sdma_select_engine_vl(dd,
691 					 (u32)(uctxt->ctxt + fd->subctxt),
692 					 vl);
693 	if (!req->sde || !sdma_running(req->sde)) {
694 		ret = -ECOMM;
695 		goto free_req;
696 	}
697 
698 	/* We don't need an AHG entry if the request contains only one packet */
699 	if (req->info.npkts > 1 && HFI1_CAP_IS_USET(SDMA_AHG)) {
700 		int ahg = sdma_ahg_alloc(req->sde);
701 
702 		if (likely(ahg >= 0)) {
703 			req->ahg_idx = (u8)ahg;
704 			set_bit(SDMA_REQ_HAVE_AHG, &req->flags);
705 		}
706 	}
707 
708 	set_comp_state(pq, cq, info.comp_idx, QUEUED, 0);
709 	atomic_inc(&pq->n_reqs);
710 	req_queued = 1;
711 	/* Send the first N packets in the request to buy us some time */
712 	ret = user_sdma_send_pkts(req, pcount);
713 	if (unlikely(ret < 0 && ret != -EBUSY)) {
714 		req->status = ret;
715 		goto free_req;
716 	}
717 
718 	/*
719 	 * It is possible that the SDMA engine would have processed all the
720 	 * submitted packets by the time we get here. Therefore, only set
721 	 * packet queue state to ACTIVE if there are still uncompleted
722 	 * requests.
723 	 */
724 	if (atomic_read(&pq->n_reqs))
725 		xchg(&pq->state, SDMA_PKT_Q_ACTIVE);
726 
727 	/*
728 	 * This is a somewhat blocking send implementation.
729 	 * The driver will block the caller until all packets of the
730 	 * request have been submitted to the SDMA engine. However, it
731 	 * will not wait for send completions.
732 	 */
733 	while (!test_bit(SDMA_REQ_SEND_DONE, &req->flags)) {
734 		ret = user_sdma_send_pkts(req, pcount);
735 		if (ret < 0) {
736 			if (ret != -EBUSY) {
737 				req->status = ret;
738 				set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
739 				if (ACCESS_ONCE(req->seqcomp) ==
740 				    req->seqsubmitted - 1)
741 					goto free_req;
742 				return ret;
743 			}
744 			wait_event_interruptible_timeout(
745 				pq->busy.wait_dma,
746 				(pq->state == SDMA_PKT_Q_ACTIVE),
747 				msecs_to_jiffies(
748 					SDMA_IOWAIT_TIMEOUT));
749 		}
750 	}
751 	*count += idx;
752 	return 0;
753 free_req:
754 	user_sdma_free_request(req, true);
755 	if (req_queued)
756 		pq_update(pq);
757 	set_comp_state(pq, cq, info.comp_idx, ERROR, req->status);
758 	return ret;
759 }
760 
761 static inline u32 compute_data_length(struct user_sdma_request *req,
762 				      struct user_sdma_txreq *tx)
763 {
764 	/*
765 	 * Determine the proper size of the packet data.
766 	 * The size of the data of the first packet is in the header
767 	 * template. However, it includes the header and ICRC, which need
768 	 * to be subtracted.
769 	 * The size of the remaining packets is the minimum of the frag
770 	 * size (MTU) or remaining data in the request.
771 	 */
772 	u32 len;
773 
774 	if (!req->seqnum) {
775 		len = ((be16_to_cpu(req->hdr.lrh[2]) << 2) -
776 		       (sizeof(tx->hdr) - 4));
777 	} else if (req_opcode(req->info.ctrl) == EXPECTED) {
778 		u32 tidlen = EXP_TID_GET(req->tids[req->tididx], LEN) *
779 			PAGE_SIZE;
780 		/*
781 		 * Get the data length based on the remaining space in the
782 		 * TID pair.
783 		 */
784 		len = min(tidlen - req->tidoffset, (u32)req->info.fragsize);
785 		/* If we've filled up the TID pair, move to the next one. */
786 		if (unlikely(!len) && ++req->tididx < req->n_tids &&
787 		    req->tids[req->tididx]) {
788 			tidlen = EXP_TID_GET(req->tids[req->tididx],
789 					     LEN) * PAGE_SIZE;
790 			req->tidoffset = 0;
791 			len = min_t(u32, tidlen, req->info.fragsize);
792 		}
793 		/*
794 		 * Since the TID pairs map entire pages, make sure that we
795 		 * are not going to try to send more data that we have
796 		 * remaining.
797 		 */
798 		len = min(len, req->data_len - req->sent);
799 	} else {
800 		len = min(req->data_len - req->sent, (u32)req->info.fragsize);
801 	}
802 	SDMA_DBG(req, "Data Length = %u", len);
803 	return len;
804 }
805 
806 static inline u32 get_lrh_len(struct hfi1_pkt_header hdr, u32 len)
807 {
808 	/* (Size of complete header - size of PBC) + 4B ICRC + data length */
809 	return ((sizeof(hdr) - sizeof(hdr.pbc)) + 4 + len);
810 }
811 
812 static int user_sdma_send_pkts(struct user_sdma_request *req, unsigned maxpkts)
813 {
814 	int ret = 0;
815 	unsigned npkts = 0;
816 	struct user_sdma_txreq *tx = NULL;
817 	struct hfi1_user_sdma_pkt_q *pq = NULL;
818 	struct user_sdma_iovec *iovec = NULL;
819 
820 	if (!req->pq)
821 		return -EINVAL;
822 
823 	pq = req->pq;
824 
825 	/* If tx completion has reported an error, we are done. */
826 	if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
827 		set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
828 		return -EFAULT;
829 	}
830 
831 	/*
832 	 * Check if we might have sent the entire request already
833 	 */
834 	if (unlikely(req->seqnum == req->info.npkts)) {
835 		if (!list_empty(&req->txps))
836 			goto dosend;
837 		return ret;
838 	}
839 
840 	if (!maxpkts || maxpkts > req->info.npkts - req->seqnum)
841 		maxpkts = req->info.npkts - req->seqnum;
842 
843 	while (npkts < maxpkts) {
844 		u32 datalen = 0, queued = 0, data_sent = 0;
845 		u64 iov_offset = 0;
846 
847 		/*
848 		 * Check whether any of the completions have come back
849 		 * with errors. If so, we are not going to process any
850 		 * more packets from this request.
851 		 */
852 		if (test_bit(SDMA_REQ_HAS_ERROR, &req->flags)) {
853 			set_bit(SDMA_REQ_DONE_ERROR, &req->flags);
854 			return -EFAULT;
855 		}
856 
857 		tx = kmem_cache_alloc(pq->txreq_cache, GFP_KERNEL);
858 		if (!tx)
859 			return -ENOMEM;
860 
861 		tx->flags = 0;
862 		tx->req = req;
863 		tx->busycount = 0;
864 		INIT_LIST_HEAD(&tx->list);
865 
866 		if (req->seqnum == req->info.npkts - 1)
867 			tx->flags |= TXREQ_FLAGS_REQ_LAST_PKT;
868 
869 		/*
870 		 * Calculate the payload size - this is min of the fragment
871 		 * (MTU) size or the remaining bytes in the request but only
872 		 * if we have payload data.
873 		 */
874 		if (req->data_len) {
875 			iovec = &req->iovs[req->iov_idx];
876 			if (ACCESS_ONCE(iovec->offset) == iovec->iov.iov_len) {
877 				if (++req->iov_idx == req->data_iovs) {
878 					ret = -EFAULT;
879 					goto free_txreq;
880 				}
881 				iovec = &req->iovs[req->iov_idx];
882 				WARN_ON(iovec->offset);
883 			}
884 
885 			datalen = compute_data_length(req, tx);
886 			if (!datalen) {
887 				SDMA_DBG(req,
888 					 "Request has data but pkt len is 0");
889 				ret = -EFAULT;
890 				goto free_tx;
891 			}
892 		}
893 
894 		if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags)) {
895 			if (!req->seqnum) {
896 				u16 pbclen = le16_to_cpu(req->hdr.pbc[0]);
897 				u32 lrhlen = get_lrh_len(req->hdr, datalen);
898 				/*
899 				 * Copy the request header into the tx header
900 				 * because the HW needs a cacheline-aligned
901 				 * address.
902 				 * This copy can be optimized out if the hdr
903 				 * member of user_sdma_request were also
904 				 * cacheline aligned.
905 				 */
906 				memcpy(&tx->hdr, &req->hdr, sizeof(tx->hdr));
907 				if (PBC2LRH(pbclen) != lrhlen) {
908 					pbclen = (pbclen & 0xf000) |
909 						LRH2PBC(lrhlen);
910 					tx->hdr.pbc[0] = cpu_to_le16(pbclen);
911 				}
912 				ret = sdma_txinit_ahg(&tx->txreq,
913 						      SDMA_TXREQ_F_AHG_COPY,
914 						      sizeof(tx->hdr) + datalen,
915 						      req->ahg_idx, 0, NULL, 0,
916 						      user_sdma_txreq_cb);
917 				if (ret)
918 					goto free_tx;
919 				ret = sdma_txadd_kvaddr(pq->dd, &tx->txreq,
920 							&tx->hdr,
921 							sizeof(tx->hdr));
922 				if (ret)
923 					goto free_txreq;
924 			} else {
925 				int changes;
926 
927 				changes = set_txreq_header_ahg(req, tx,
928 							       datalen);
929 				if (changes < 0)
930 					goto free_tx;
931 				sdma_txinit_ahg(&tx->txreq,
932 						SDMA_TXREQ_F_USE_AHG,
933 						datalen, req->ahg_idx, changes,
934 						req->ahg, sizeof(req->hdr),
935 						user_sdma_txreq_cb);
936 			}
937 		} else {
938 			ret = sdma_txinit(&tx->txreq, 0, sizeof(req->hdr) +
939 					  datalen, user_sdma_txreq_cb);
940 			if (ret)
941 				goto free_tx;
942 			/*
943 			 * Modify the header for this packet. This only needs
944 			 * to be done if we are not going to use AHG. Otherwise,
945 			 * the HW will do it based on the changes we gave it
946 			 * during sdma_txinit_ahg().
947 			 */
948 			ret = set_txreq_header(req, tx, datalen);
949 			if (ret)
950 				goto free_txreq;
951 		}
952 
953 		/*
954 		 * If the request contains any data vectors, add up to
955 		 * fragsize bytes to the descriptor.
956 		 */
957 		while (queued < datalen &&
958 		       (req->sent + data_sent) < req->data_len) {
959 			unsigned long base, offset;
960 			unsigned pageidx, len;
961 
962 			base = (unsigned long)iovec->iov.iov_base;
963 			offset = offset_in_page(base + iovec->offset +
964 						iov_offset);
965 			pageidx = (((iovec->offset + iov_offset +
966 				     base) - (base & PAGE_MASK)) >> PAGE_SHIFT);
967 			len = offset + req->info.fragsize > PAGE_SIZE ?
968 				PAGE_SIZE - offset : req->info.fragsize;
969 			len = min((datalen - queued), len);
970 			ret = sdma_txadd_page(pq->dd, &tx->txreq,
971 					      iovec->pages[pageidx],
972 					      offset, len);
973 			if (ret) {
974 				SDMA_DBG(req, "SDMA txreq add page failed %d\n",
975 					 ret);
976 				goto free_txreq;
977 			}
978 			iov_offset += len;
979 			queued += len;
980 			data_sent += len;
981 			if (unlikely(queued < datalen &&
982 				     pageidx == iovec->npages &&
983 				     req->iov_idx < req->data_iovs - 1)) {
984 				iovec->offset += iov_offset;
985 				iovec = &req->iovs[++req->iov_idx];
986 				iov_offset = 0;
987 			}
988 		}
989 		/*
990 		 * The txreq was submitted successfully so we can update
991 		 * the counters.
992 		 */
993 		req->koffset += datalen;
994 		if (req_opcode(req->info.ctrl) == EXPECTED)
995 			req->tidoffset += datalen;
996 		req->sent += data_sent;
997 		if (req->data_len)
998 			iovec->offset += iov_offset;
999 		list_add_tail(&tx->txreq.list, &req->txps);
1000 		/*
1001 		 * It is important to increment this here as it is used to
1002 		 * generate the BTH.PSN and, therefore, can't be bulk-updated
1003 		 * outside of the loop.
1004 		 */
1005 		tx->seqnum = req->seqnum++;
1006 		npkts++;
1007 	}
1008 dosend:
1009 	ret = sdma_send_txlist(req->sde, &pq->busy, &req->txps);
1010 	if (list_empty(&req->txps)) {
1011 		req->seqsubmitted = req->seqnum;
1012 		if (req->seqnum == req->info.npkts) {
1013 			set_bit(SDMA_REQ_SEND_DONE, &req->flags);
1014 			/*
1015 			 * The txreq has already been submitted to the HW queue
1016 			 * so we can free the AHG entry now. Corruption will not
1017 			 * happen due to the sequential manner in which
1018 			 * descriptors are processed.
1019 			 */
1020 			if (test_bit(SDMA_REQ_HAVE_AHG, &req->flags))
1021 				sdma_ahg_free(req->sde, req->ahg_idx);
1022 		}
1023 	} else if (ret > 0) {
1024 		req->seqsubmitted += ret;
1025 		ret = 0;
1026 	}
1027 	return ret;
1028 
1029 free_txreq:
1030 	sdma_txclean(pq->dd, &tx->txreq);
1031 free_tx:
1032 	kmem_cache_free(pq->txreq_cache, tx);
1033 	return ret;
1034 }
1035 
1036 /*
1037  * How many pages in this iovec element?
1038  */
1039 static inline int num_user_pages(const struct iovec *iov)
1040 {
1041 	const unsigned long addr  = (unsigned long)iov->iov_base;
1042 	const unsigned long len   = iov->iov_len;
1043 	const unsigned long spage = addr & PAGE_MASK;
1044 	const unsigned long epage = (addr + len - 1) & PAGE_MASK;
1045 
1046 	return 1 + ((epage - spage) >> PAGE_SHIFT);
1047 }
1048 
1049 static u32 sdma_cache_evict(struct hfi1_user_sdma_pkt_q *pq, u32 npages)
1050 {
1051 	u32 cleared = 0;
1052 	struct sdma_mmu_node *node, *ptr;
1053 	struct list_head to_evict = LIST_HEAD_INIT(to_evict);
1054 
1055 	spin_lock(&pq->evict_lock);
1056 	list_for_each_entry_safe_reverse(node, ptr, &pq->evict, list) {
1057 		/* Make sure that no one is still using the node. */
1058 		if (!atomic_read(&node->refcount)) {
1059 			set_bit(SDMA_CACHE_NODE_EVICT, &node->flags);
1060 			list_del_init(&node->list);
1061 			list_add(&node->list, &to_evict);
1062 			cleared += node->npages;
1063 			if (cleared >= npages)
1064 				break;
1065 		}
1066 	}
1067 	spin_unlock(&pq->evict_lock);
1068 
1069 	list_for_each_entry_safe(node, ptr, &to_evict, list)
1070 		hfi1_mmu_rb_remove(&pq->sdma_rb_root, &node->rb);
1071 
1072 	return cleared;
1073 }
1074 
1075 static int pin_vector_pages(struct user_sdma_request *req,
1076 			    struct user_sdma_iovec *iovec) {
1077 	int ret = 0, pinned, npages, cleared;
1078 	struct page **pages;
1079 	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1080 	struct sdma_mmu_node *node = NULL;
1081 	struct mmu_rb_node *rb_node;
1082 
1083 	rb_node = hfi1_mmu_rb_extract(&pq->sdma_rb_root,
1084 				      (unsigned long)iovec->iov.iov_base,
1085 				      iovec->iov.iov_len);
1086 	if (rb_node && !IS_ERR(rb_node))
1087 		node = container_of(rb_node, struct sdma_mmu_node, rb);
1088 	else
1089 		rb_node = NULL;
1090 
1091 	if (!node) {
1092 		node = kzalloc(sizeof(*node), GFP_KERNEL);
1093 		if (!node)
1094 			return -ENOMEM;
1095 
1096 		node->rb.addr = (unsigned long)iovec->iov.iov_base;
1097 		node->pq = pq;
1098 		atomic_set(&node->refcount, 0);
1099 		INIT_LIST_HEAD(&node->list);
1100 	}
1101 
1102 	npages = num_user_pages(&iovec->iov);
1103 	if (node->npages < npages) {
1104 		pages = kcalloc(npages, sizeof(*pages), GFP_KERNEL);
1105 		if (!pages) {
1106 			SDMA_DBG(req, "Failed page array alloc");
1107 			ret = -ENOMEM;
1108 			goto bail;
1109 		}
1110 		memcpy(pages, node->pages, node->npages * sizeof(*pages));
1111 
1112 		npages -= node->npages;
1113 
1114 		/*
1115 		 * If rb_node is NULL, it means that this is brand new node
1116 		 * and, therefore not on the eviction list.
1117 		 * If, however, the rb_node is non-NULL, it means that the
1118 		 * node is already in RB tree and, therefore on the eviction
1119 		 * list (nodes are unconditionally inserted in the eviction
1120 		 * list). In that case, we have to remove the node prior to
1121 		 * calling the eviction function in order to prevent it from
1122 		 * freeing this node.
1123 		 */
1124 		if (rb_node) {
1125 			spin_lock(&pq->evict_lock);
1126 			list_del_init(&node->list);
1127 			spin_unlock(&pq->evict_lock);
1128 		}
1129 retry:
1130 		if (!hfi1_can_pin_pages(pq->dd, pq->n_locked, npages)) {
1131 			cleared = sdma_cache_evict(pq, npages);
1132 			if (cleared >= npages)
1133 				goto retry;
1134 		}
1135 		pinned = hfi1_acquire_user_pages(
1136 			((unsigned long)iovec->iov.iov_base +
1137 			 (node->npages * PAGE_SIZE)), npages, 0,
1138 			pages + node->npages);
1139 		if (pinned < 0) {
1140 			kfree(pages);
1141 			ret = pinned;
1142 			goto bail;
1143 		}
1144 		if (pinned != npages) {
1145 			unpin_vector_pages(current->mm, pages, node->npages,
1146 					   pinned);
1147 			ret = -EFAULT;
1148 			goto bail;
1149 		}
1150 		kfree(node->pages);
1151 		node->rb.len = iovec->iov.iov_len;
1152 		node->pages = pages;
1153 		node->npages += pinned;
1154 		npages = node->npages;
1155 		spin_lock(&pq->evict_lock);
1156 		list_add(&node->list, &pq->evict);
1157 		pq->n_locked += pinned;
1158 		spin_unlock(&pq->evict_lock);
1159 	}
1160 	iovec->pages = node->pages;
1161 	iovec->npages = npages;
1162 	iovec->node = node;
1163 
1164 	ret = hfi1_mmu_rb_insert(&req->pq->sdma_rb_root, &node->rb);
1165 	if (ret) {
1166 		spin_lock(&pq->evict_lock);
1167 		if (!list_empty(&node->list))
1168 			list_del(&node->list);
1169 		pq->n_locked -= node->npages;
1170 		spin_unlock(&pq->evict_lock);
1171 		goto bail;
1172 	}
1173 	return 0;
1174 bail:
1175 	if (rb_node)
1176 		unpin_vector_pages(current->mm, node->pages, 0, node->npages);
1177 	kfree(node);
1178 	return ret;
1179 }
1180 
/*
 * Release @npages pinned pages beginning at index @start of @pages and then
 * free the @pages array itself. Note that the whole array is freed even when
 * only a sub-range of it is released.
 */
static void unpin_vector_pages(struct mm_struct *mm, struct page **pages,
			       unsigned start, unsigned npages)
{
	struct page **first = &pages[start];

	hfi1_release_user_pages(mm, first, npages, 0);
	kfree(pages);
}
1187 
1188 static int check_header_template(struct user_sdma_request *req,
1189 				 struct hfi1_pkt_header *hdr, u32 lrhlen,
1190 				 u32 datalen)
1191 {
1192 	/*
1193 	 * Perform safety checks for any type of packet:
1194 	 *    - transfer size is multiple of 64bytes
1195 	 *    - packet length is multiple of 4bytes
1196 	 *    - entire request length is multiple of 4bytes
1197 	 *    - packet length is not larger than MTU size
1198 	 *
1199 	 * These checks are only done for the first packet of the
1200 	 * transfer since the header is "given" to us by user space.
1201 	 * For the remainder of the packets we compute the values.
1202 	 */
1203 	if (req->info.fragsize % PIO_BLOCK_SIZE ||
1204 	    lrhlen & 0x3 || req->data_len & 0x3  ||
1205 	    lrhlen > get_lrh_len(*hdr, req->info.fragsize))
1206 		return -EINVAL;
1207 
1208 	if (req_opcode(req->info.ctrl) == EXPECTED) {
1209 		/*
1210 		 * The header is checked only on the first packet. Furthermore,
1211 		 * we ensure that at least one TID entry is copied when the
1212 		 * request is submitted. Therefore, we don't have to verify that
1213 		 * tididx points to something sane.
1214 		 */
1215 		u32 tidval = req->tids[req->tididx],
1216 			tidlen = EXP_TID_GET(tidval, LEN) * PAGE_SIZE,
1217 			tididx = EXP_TID_GET(tidval, IDX),
1218 			tidctrl = EXP_TID_GET(tidval, CTRL),
1219 			tidoff;
1220 		__le32 kval = hdr->kdeth.ver_tid_offset;
1221 
1222 		tidoff = KDETH_GET(kval, OFFSET) *
1223 			  (KDETH_GET(req->hdr.kdeth.ver_tid_offset, OM) ?
1224 			   KDETH_OM_LARGE : KDETH_OM_SMALL);
1225 		/*
1226 		 * Expected receive packets have the following
1227 		 * additional checks:
1228 		 *     - offset is not larger than the TID size
1229 		 *     - TIDCtrl values match between header and TID array
1230 		 *     - TID indexes match between header and TID array
1231 		 */
1232 		if ((tidoff + datalen > tidlen) ||
1233 		    KDETH_GET(kval, TIDCTRL) != tidctrl ||
1234 		    KDETH_GET(kval, TID) != tididx)
1235 			return -EINVAL;
1236 	}
1237 	return 0;
1238 }
1239 
1240 /*
1241  * Correctly set the BTH.PSN field based on type of
1242  * transfer - eager packets can just increment the PSN but
1243  * expected packets encode generation and sequence in the
1244  * BTH.PSN field so just incrementing will result in errors.
1245  */
1246 static inline u32 set_pkt_bth_psn(__be32 bthpsn, u8 expct, u32 frags)
1247 {
1248 	u32 val = be32_to_cpu(bthpsn),
1249 		mask = (HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffffull :
1250 			0xffffffull),
1251 		psn = val & mask;
1252 	if (expct)
1253 		psn = (psn & ~BTH_SEQ_MASK) | ((psn + frags) & BTH_SEQ_MASK);
1254 	else
1255 		psn = psn + frags;
1256 	return psn & mask;
1257 }
1258 
1259 static int set_txreq_header(struct user_sdma_request *req,
1260 			    struct user_sdma_txreq *tx, u32 datalen)
1261 {
1262 	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1263 	struct hfi1_pkt_header *hdr = &tx->hdr;
1264 	u16 pbclen;
1265 	int ret;
1266 	u32 tidval = 0, lrhlen = get_lrh_len(*hdr, datalen);
1267 
1268 	/* Copy the header template to the request before modification */
1269 	memcpy(hdr, &req->hdr, sizeof(*hdr));
1270 
1271 	/*
1272 	 * Check if the PBC and LRH length are mismatched. If so
1273 	 * adjust both in the header.
1274 	 */
1275 	pbclen = le16_to_cpu(hdr->pbc[0]);
1276 	if (PBC2LRH(pbclen) != lrhlen) {
1277 		pbclen = (pbclen & 0xf000) | LRH2PBC(lrhlen);
1278 		hdr->pbc[0] = cpu_to_le16(pbclen);
1279 		hdr->lrh[2] = cpu_to_be16(lrhlen >> 2);
1280 		/*
1281 		 * Third packet
1282 		 * This is the first packet in the sequence that has
1283 		 * a "static" size that can be used for the rest of
1284 		 * the packets (besides the last one).
1285 		 */
1286 		if (unlikely(req->seqnum == 2)) {
1287 			/*
1288 			 * From this point on the lengths in both the
1289 			 * PBC and LRH are the same until the last
1290 			 * packet.
1291 			 * Adjust the template so we don't have to update
1292 			 * every packet
1293 			 */
1294 			req->hdr.pbc[0] = hdr->pbc[0];
1295 			req->hdr.lrh[2] = hdr->lrh[2];
1296 		}
1297 	}
1298 	/*
1299 	 * We only have to modify the header if this is not the
1300 	 * first packet in the request. Otherwise, we use the
1301 	 * header given to us.
1302 	 */
1303 	if (unlikely(!req->seqnum)) {
1304 		ret = check_header_template(req, hdr, lrhlen, datalen);
1305 		if (ret)
1306 			return ret;
1307 		goto done;
1308 	}
1309 
1310 	hdr->bth[2] = cpu_to_be32(
1311 		set_pkt_bth_psn(hdr->bth[2],
1312 				(req_opcode(req->info.ctrl) == EXPECTED),
1313 				req->seqnum));
1314 
1315 	/* Set ACK request on last packet */
1316 	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
1317 		hdr->bth[2] |= cpu_to_be32(1UL << 31);
1318 
1319 	/* Set the new offset */
1320 	hdr->kdeth.swdata[6] = cpu_to_le32(req->koffset);
1321 	/* Expected packets have to fill in the new TID information */
1322 	if (req_opcode(req->info.ctrl) == EXPECTED) {
1323 		tidval = req->tids[req->tididx];
1324 		/*
1325 		 * If the offset puts us at the end of the current TID,
1326 		 * advance everything.
1327 		 */
1328 		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1329 					 PAGE_SIZE)) {
1330 			req->tidoffset = 0;
1331 			/*
1332 			 * Since we don't copy all the TIDs, all at once,
1333 			 * we have to check again.
1334 			 */
1335 			if (++req->tididx > req->n_tids - 1 ||
1336 			    !req->tids[req->tididx]) {
1337 				return -EINVAL;
1338 			}
1339 			tidval = req->tids[req->tididx];
1340 		}
1341 		req->omfactor = EXP_TID_GET(tidval, LEN) * PAGE_SIZE >=
1342 			KDETH_OM_MAX_SIZE ? KDETH_OM_LARGE : KDETH_OM_SMALL;
1343 		/* Set KDETH.TIDCtrl based on value for this TID. */
1344 		KDETH_SET(hdr->kdeth.ver_tid_offset, TIDCTRL,
1345 			  EXP_TID_GET(tidval, CTRL));
1346 		/* Set KDETH.TID based on value for this TID */
1347 		KDETH_SET(hdr->kdeth.ver_tid_offset, TID,
1348 			  EXP_TID_GET(tidval, IDX));
1349 		/* Clear KDETH.SH only on the last packet */
1350 		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
1351 			KDETH_SET(hdr->kdeth.ver_tid_offset, SH, 0);
1352 		/*
1353 		 * Set the KDETH.OFFSET and KDETH.OM based on size of
1354 		 * transfer.
1355 		 */
1356 		SDMA_DBG(req, "TID offset %ubytes %uunits om%u",
1357 			 req->tidoffset, req->tidoffset / req->omfactor,
1358 			 req->omfactor != KDETH_OM_SMALL);
1359 		KDETH_SET(hdr->kdeth.ver_tid_offset, OFFSET,
1360 			  req->tidoffset / req->omfactor);
1361 		KDETH_SET(hdr->kdeth.ver_tid_offset, OM,
1362 			  req->omfactor != KDETH_OM_SMALL);
1363 	}
1364 done:
1365 	trace_hfi1_sdma_user_header(pq->dd, pq->ctxt, pq->subctxt,
1366 				    req->info.comp_idx, hdr, tidval);
1367 	return sdma_txadd_kvaddr(pq->dd, &tx->txreq, hdr, sizeof(*hdr));
1368 }
1369 
1370 static int set_txreq_header_ahg(struct user_sdma_request *req,
1371 				struct user_sdma_txreq *tx, u32 len)
1372 {
1373 	int diff = 0;
1374 	struct hfi1_user_sdma_pkt_q *pq = req->pq;
1375 	struct hfi1_pkt_header *hdr = &req->hdr;
1376 	u16 pbclen = le16_to_cpu(hdr->pbc[0]);
1377 	u32 val32, tidval = 0, lrhlen = get_lrh_len(*hdr, len);
1378 
1379 	if (PBC2LRH(pbclen) != lrhlen) {
1380 		/* PBC.PbcLengthDWs */
1381 		AHG_HEADER_SET(req->ahg, diff, 0, 0, 12,
1382 			       cpu_to_le16(LRH2PBC(lrhlen)));
1383 		/* LRH.PktLen (we need the full 16 bits due to byte swap) */
1384 		AHG_HEADER_SET(req->ahg, diff, 3, 0, 16,
1385 			       cpu_to_be16(lrhlen >> 2));
1386 	}
1387 
1388 	/*
1389 	 * Do the common updates
1390 	 */
1391 	/* BTH.PSN and BTH.A */
1392 	val32 = (be32_to_cpu(hdr->bth[2]) + req->seqnum) &
1393 		(HFI1_CAP_IS_KSET(EXTENDED_PSN) ? 0x7fffffff : 0xffffff);
1394 	if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT))
1395 		val32 |= 1UL << 31;
1396 	AHG_HEADER_SET(req->ahg, diff, 6, 0, 16, cpu_to_be16(val32 >> 16));
1397 	AHG_HEADER_SET(req->ahg, diff, 6, 16, 16, cpu_to_be16(val32 & 0xffff));
1398 	/* KDETH.Offset */
1399 	AHG_HEADER_SET(req->ahg, diff, 15, 0, 16,
1400 		       cpu_to_le16(req->koffset & 0xffff));
1401 	AHG_HEADER_SET(req->ahg, diff, 15, 16, 16,
1402 		       cpu_to_le16(req->koffset >> 16));
1403 	if (req_opcode(req->info.ctrl) == EXPECTED) {
1404 		__le16 val;
1405 
1406 		tidval = req->tids[req->tididx];
1407 
1408 		/*
1409 		 * If the offset puts us at the end of the current TID,
1410 		 * advance everything.
1411 		 */
1412 		if ((req->tidoffset) == (EXP_TID_GET(tidval, LEN) *
1413 					 PAGE_SIZE)) {
1414 			req->tidoffset = 0;
1415 			/*
1416 			 * Since we don't copy all the TIDs, all at once,
1417 			 * we have to check again.
1418 			 */
1419 			if (++req->tididx > req->n_tids - 1 ||
1420 			    !req->tids[req->tididx]) {
1421 				return -EINVAL;
1422 			}
1423 			tidval = req->tids[req->tididx];
1424 		}
1425 		req->omfactor = ((EXP_TID_GET(tidval, LEN) *
1426 				  PAGE_SIZE) >=
1427 				 KDETH_OM_MAX_SIZE) ? KDETH_OM_LARGE :
1428 			KDETH_OM_SMALL;
1429 		/* KDETH.OM and KDETH.OFFSET (TID) */
1430 		AHG_HEADER_SET(req->ahg, diff, 7, 0, 16,
1431 			       ((!!(req->omfactor - KDETH_OM_SMALL)) << 15 |
1432 				((req->tidoffset / req->omfactor) & 0x7fff)));
1433 		/* KDETH.TIDCtrl, KDETH.TID */
1434 		val = cpu_to_le16(((EXP_TID_GET(tidval, CTRL) & 0x3) << 10) |
1435 					(EXP_TID_GET(tidval, IDX) & 0x3ff));
1436 		/* Clear KDETH.SH on last packet */
1437 		if (unlikely(tx->flags & TXREQ_FLAGS_REQ_LAST_PKT)) {
1438 			val |= cpu_to_le16(KDETH_GET(hdr->kdeth.ver_tid_offset,
1439 								INTR) >> 16);
1440 			val &= cpu_to_le16(~(1U << 13));
1441 			AHG_HEADER_SET(req->ahg, diff, 7, 16, 14, val);
1442 		} else {
1443 			AHG_HEADER_SET(req->ahg, diff, 7, 16, 12, val);
1444 		}
1445 	}
1446 
1447 	trace_hfi1_sdma_user_header_ahg(pq->dd, pq->ctxt, pq->subctxt,
1448 					req->info.comp_idx, req->sde->this_idx,
1449 					req->ahg_idx, req->ahg, diff, tidval);
1450 	return diff;
1451 }
1452 
1453 /*
1454  * SDMA tx request completion callback. Called when the SDMA progress
1455  * state machine gets notification that the SDMA descriptors for this
1456  * tx request have been processed by the DMA engine. Called in
1457  * interrupt context.
1458  */
1459 static void user_sdma_txreq_cb(struct sdma_txreq *txreq, int status)
1460 {
1461 	struct user_sdma_txreq *tx =
1462 		container_of(txreq, struct user_sdma_txreq, txreq);
1463 	struct user_sdma_request *req;
1464 	struct hfi1_user_sdma_pkt_q *pq;
1465 	struct hfi1_user_sdma_comp_q *cq;
1466 	u16 idx;
1467 
1468 	if (!tx->req)
1469 		return;
1470 
1471 	req = tx->req;
1472 	pq = req->pq;
1473 	cq = req->cq;
1474 
1475 	if (status != SDMA_TXREQ_S_OK) {
1476 		SDMA_DBG(req, "SDMA completion with error %d",
1477 			 status);
1478 		set_bit(SDMA_REQ_HAS_ERROR, &req->flags);
1479 	}
1480 
1481 	req->seqcomp = tx->seqnum;
1482 	kmem_cache_free(pq->txreq_cache, tx);
1483 	tx = NULL;
1484 
1485 	idx = req->info.comp_idx;
1486 	if (req->status == -1 && status == SDMA_TXREQ_S_OK) {
1487 		if (req->seqcomp == req->info.npkts - 1) {
1488 			req->status = 0;
1489 			user_sdma_free_request(req, false);
1490 			pq_update(pq);
1491 			set_comp_state(pq, cq, idx, COMPLETE, 0);
1492 		}
1493 	} else {
1494 		if (status != SDMA_TXREQ_S_OK)
1495 			req->status = status;
1496 		if (req->seqcomp == (ACCESS_ONCE(req->seqsubmitted) - 1) &&
1497 		    (test_bit(SDMA_REQ_SEND_DONE, &req->flags) ||
1498 		     test_bit(SDMA_REQ_DONE_ERROR, &req->flags))) {
1499 			user_sdma_free_request(req, false);
1500 			pq_update(pq);
1501 			set_comp_state(pq, cq, idx, ERROR, req->status);
1502 		}
1503 	}
1504 }
1505 
1506 static inline void pq_update(struct hfi1_user_sdma_pkt_q *pq)
1507 {
1508 	if (atomic_dec_and_test(&pq->n_reqs)) {
1509 		xchg(&pq->state, SDMA_PKT_Q_INACTIVE);
1510 		wake_up(&pq->wait);
1511 	}
1512 }
1513 
1514 static void user_sdma_free_request(struct user_sdma_request *req, bool unpin)
1515 {
1516 	if (!list_empty(&req->txps)) {
1517 		struct sdma_txreq *t, *p;
1518 
1519 		list_for_each_entry_safe(t, p, &req->txps, list) {
1520 			struct user_sdma_txreq *tx =
1521 				container_of(t, struct user_sdma_txreq, txreq);
1522 			list_del_init(&t->list);
1523 			sdma_txclean(req->pq->dd, t);
1524 			kmem_cache_free(req->pq->txreq_cache, tx);
1525 		}
1526 	}
1527 	if (req->data_iovs) {
1528 		struct sdma_mmu_node *node;
1529 		int i;
1530 
1531 		for (i = 0; i < req->data_iovs; i++) {
1532 			node = req->iovs[i].node;
1533 			if (!node)
1534 				continue;
1535 
1536 			if (unpin)
1537 				hfi1_mmu_rb_remove(&req->pq->sdma_rb_root,
1538 						   &node->rb);
1539 			else
1540 				atomic_dec(&node->refcount);
1541 		}
1542 	}
1543 	kfree(req->tids);
1544 	clear_bit(SDMA_REQ_IN_USE, &req->flags);
1545 }
1546 
1547 static inline void set_comp_state(struct hfi1_user_sdma_pkt_q *pq,
1548 				  struct hfi1_user_sdma_comp_q *cq,
1549 				  u16 idx, enum hfi1_sdma_comp_state state,
1550 				  int ret)
1551 {
1552 	hfi1_cdbg(SDMA, "[%u:%u:%u:%u] Setting completion status %u %d",
1553 		  pq->dd->unit, pq->ctxt, pq->subctxt, idx, state, ret);
1554 	cq->comps[idx].status = state;
1555 	if (state == ERROR)
1556 		cq->comps[idx].errcode = -ret;
1557 	trace_hfi1_sdma_user_completion(pq->dd, pq->ctxt, pq->subctxt,
1558 					idx, state, ret);
1559 }
1560 
1561 static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
1562 			   unsigned long len)
1563 {
1564 	return (bool)(node->addr == addr);
1565 }
1566 
1567 static int sdma_rb_insert(struct rb_root *root, struct mmu_rb_node *mnode)
1568 {
1569 	struct sdma_mmu_node *node =
1570 		container_of(mnode, struct sdma_mmu_node, rb);
1571 
1572 	atomic_inc(&node->refcount);
1573 	return 0;
1574 }
1575 
1576 static void sdma_rb_remove(struct rb_root *root, struct mmu_rb_node *mnode,
1577 			   struct mm_struct *mm)
1578 {
1579 	struct sdma_mmu_node *node =
1580 		container_of(mnode, struct sdma_mmu_node, rb);
1581 
1582 	spin_lock(&node->pq->evict_lock);
1583 	/*
1584 	 * We've been called by the MMU notifier but this node has been
1585 	 * scheduled for eviction. The eviction function will take care
1586 	 * of freeing this node.
1587 	 * We have to take the above lock first because we are racing
1588 	 * against the setting of the bit in the eviction function.
1589 	 */
1590 	if (mm && test_bit(SDMA_CACHE_NODE_EVICT, &node->flags)) {
1591 		spin_unlock(&node->pq->evict_lock);
1592 		return;
1593 	}
1594 
1595 	if (!list_empty(&node->list))
1596 		list_del(&node->list);
1597 	node->pq->n_locked -= node->npages;
1598 	spin_unlock(&node->pq->evict_lock);
1599 
1600 	/*
1601 	 * If mm is set, we are being called by the MMU notifier and we
1602 	 * should not pass a mm_struct to unpin_vector_page(). This is to
1603 	 * prevent a deadlock when hfi1_release_user_pages() attempts to
1604 	 * take the mmap_sem, which the MMU notifier has already taken.
1605 	 */
1606 	unpin_vector_pages(mm ? NULL : current->mm, node->pages, 0,
1607 			   node->npages);
1608 	/*
1609 	 * If called by the MMU notifier, we have to adjust the pinned
1610 	 * page count ourselves.
1611 	 */
1612 	if (mm)
1613 		mm->pinned_vm -= node->npages;
1614 	kfree(node);
1615 }
1616 
1617 static int sdma_rb_invalidate(struct rb_root *root, struct mmu_rb_node *mnode)
1618 {
1619 	struct sdma_mmu_node *node =
1620 		container_of(mnode, struct sdma_mmu_node, rb);
1621 
1622 	if (!atomic_read(&node->refcount))
1623 		return 1;
1624 	return 0;
1625 }
1626