xref: /linux/drivers/infiniband/hw/hfi1/file_ops.c (revision e58e871becec2d3b04ed91c0c16fe8deac9c9dfa)
1 /*
2  * Copyright(c) 2015-2017 Intel Corporation.
3  *
4  * This file is provided under a dual BSD/GPLv2 license.  When using or
5  * redistributing this file, you may do so under either license.
6  *
7  * GPL LICENSE SUMMARY
8  *
9  * This program is free software; you can redistribute it and/or modify
10  * it under the terms of version 2 of the GNU General Public License as
11  * published by the Free Software Foundation.
12  *
13  * This program is distributed in the hope that it will be useful, but
14  * WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * General Public License for more details.
17  *
18  * BSD LICENSE
19  *
20  * Redistribution and use in source and binary forms, with or without
21  * modification, are permitted provided that the following conditions
22  * are met:
23  *
24  *  - Redistributions of source code must retain the above copyright
25  *    notice, this list of conditions and the following disclaimer.
26  *  - Redistributions in binary form must reproduce the above copyright
27  *    notice, this list of conditions and the following disclaimer in
28  *    the documentation and/or other materials provided with the
29  *    distribution.
30  *  - Neither the name of Intel Corporation nor the names of its
31  *    contributors may be used to endorse or promote products derived
32  *    from this software without specific prior written permission.
33  *
34  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
35  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
36  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
37  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
38  * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
39  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
40  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
41  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
42  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
43  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
44  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
45  *
46  */
47 #include <linux/poll.h>
48 #include <linux/cdev.h>
49 #include <linux/vmalloc.h>
50 #include <linux/io.h>
51 #include <linux/sched/mm.h>
52 #include <linux/bitmap.h>
53 
54 #include <rdma/ib.h>
55 
56 #include "hfi.h"
57 #include "pio.h"
58 #include "device.h"
59 #include "common.h"
60 #include "trace.h"
61 #include "user_sdma.h"
62 #include "user_exp_rcv.h"
63 #include "aspm.h"
64 #include "mmu_rb.h"
65 
66 #undef pr_fmt
67 #define pr_fmt(fmt) DRIVER_NAME ": " fmt
68 
69 #define SEND_CTXT_HALT_TIMEOUT 1000 /* msecs */
70 
71 /*
72  * File operation functions
73  */
74 static int hfi1_file_open(struct inode *inode, struct file *fp);
75 static int hfi1_file_close(struct inode *inode, struct file *fp);
76 static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from);
77 static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt);
78 static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma);
79 
80 static u64 kvirt_to_phys(void *addr);
81 static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo);
82 static int init_subctxts(struct hfi1_ctxtdata *uctxt,
83 			 const struct hfi1_user_info *uinfo);
84 static int init_user_ctxt(struct hfi1_filedata *fd);
85 static void user_init(struct hfi1_ctxtdata *uctxt);
86 static int get_ctxt_info(struct hfi1_filedata *fd, void __user *ubase,
87 			 __u32 len);
88 static int get_base_info(struct hfi1_filedata *fd, void __user *ubase,
89 			 __u32 len);
90 static int setup_base_ctxt(struct hfi1_filedata *fd);
91 static int setup_subctxt(struct hfi1_ctxtdata *uctxt);
92 
93 static int find_sub_ctxt(struct hfi1_filedata *fd,
94 			 const struct hfi1_user_info *uinfo);
95 static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
96 			 struct hfi1_user_info *uinfo);
97 static unsigned int poll_urgent(struct file *fp, struct poll_table_struct *pt);
98 static unsigned int poll_next(struct file *fp, struct poll_table_struct *pt);
99 static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt,
100 			  unsigned long events);
101 static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, u16 subctxt, u16 pkey);
102 static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt,
103 		       int start_stop);
104 static int vma_fault(struct vm_fault *vmf);
105 static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
106 			    unsigned long arg);
107 
108 static const struct file_operations hfi1_file_ops = {
109 	.owner = THIS_MODULE,
110 	.write_iter = hfi1_write_iter,
111 	.open = hfi1_file_open,
112 	.release = hfi1_file_close,
113 	.unlocked_ioctl = hfi1_file_ioctl,
114 	.poll = hfi1_poll,
115 	.mmap = hfi1_file_mmap,
116 	.llseek = noop_llseek,
117 };
118 
119 static const struct vm_operations_struct vm_ops = {
120 	.fault = vma_fault,
121 };
122 
123 /*
124  * Types of memories mapped into user processes' space
125  */
126 enum mmap_types {
127 	PIO_BUFS = 1,
128 	PIO_BUFS_SOP,
129 	PIO_CRED,
130 	RCV_HDRQ,
131 	RCV_EGRBUF,
132 	UREGS,
133 	EVENTS,
134 	STATUS,
135 	RTAIL,
136 	SUBCTXT_UREGS,
137 	SUBCTXT_RCV_HDRQ,
138 	SUBCTXT_EGRBUF,
139 	SDMA_COMP
140 };
141 
142 /*
143  * Masks and offsets defining the mmap tokens
144  */
145 #define HFI1_MMAP_OFFSET_MASK   0xfffULL
146 #define HFI1_MMAP_OFFSET_SHIFT  0
147 #define HFI1_MMAP_SUBCTXT_MASK  0xfULL
148 #define HFI1_MMAP_SUBCTXT_SHIFT 12
149 #define HFI1_MMAP_CTXT_MASK     0xffULL
150 #define HFI1_MMAP_CTXT_SHIFT    16
151 #define HFI1_MMAP_TYPE_MASK     0xfULL
152 #define HFI1_MMAP_TYPE_SHIFT    24
153 #define HFI1_MMAP_MAGIC_MASK    0xffffffffULL
154 #define HFI1_MMAP_MAGIC_SHIFT   32
155 
156 #define HFI1_MMAP_MAGIC         0xdabbad00
157 
158 #define HFI1_MMAP_TOKEN_SET(field, val)	\
159 	(((val) & HFI1_MMAP_##field##_MASK) << HFI1_MMAP_##field##_SHIFT)
160 #define HFI1_MMAP_TOKEN_GET(field, token) \
161 	(((token) >> HFI1_MMAP_##field##_SHIFT) & HFI1_MMAP_##field##_MASK)
162 #define HFI1_MMAP_TOKEN(type, ctxt, subctxt, addr)   \
163 	(HFI1_MMAP_TOKEN_SET(MAGIC, HFI1_MMAP_MAGIC) | \
164 	HFI1_MMAP_TOKEN_SET(TYPE, type) | \
165 	HFI1_MMAP_TOKEN_SET(CTXT, ctxt) | \
166 	HFI1_MMAP_TOKEN_SET(SUBCTXT, subctxt) | \
167 	HFI1_MMAP_TOKEN_SET(OFFSET, (offset_in_page(addr))))
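
/*
 * Worked example (illustrative only, with made-up context numbers): a
 * token for this context's receive header queue (RCV_HDRQ == 4 in enum
 * mmap_types above) in context 5, sub-context 2, whose buffer begins
 * 0x100 bytes into its page, packs as
 *
 *	HFI1_MMAP_TOKEN(RCV_HDRQ, 5, 2, addr)
 *		= (0xdabbad00ULL << 32) | (4 << 24) | (5 << 16) |
 *		  (2 << 12) | 0x100
 *		= 0xdabbad0004052100
 *
 * The tokens handed to user space by get_base_info() are meant to be
 * passed back as the mmap() offset; hfi1_file_mmap() below recovers the
 * MAGIC, TYPE, CTXT and SUBCTXT fields with HFI1_MMAP_TOKEN_GET().
 */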
168 
169 #define dbg(fmt, ...)				\
170 	pr_info(fmt, ##__VA_ARGS__)
171 
172 static inline int is_valid_mmap(u64 token)
173 {
174 	return (HFI1_MMAP_TOKEN_GET(MAGIC, token) == HFI1_MMAP_MAGIC);
175 }
176 
177 static int hfi1_file_open(struct inode *inode, struct file *fp)
178 {
179 	struct hfi1_filedata *fd;
180 	struct hfi1_devdata *dd = container_of(inode->i_cdev,
181 					       struct hfi1_devdata,
182 					       user_cdev);
183 
184 	if (!((dd->flags & HFI1_PRESENT) && dd->kregbase))
185 		return -EINVAL;
186 
187 	if (!atomic_inc_not_zero(&dd->user_refcount))
188 		return -ENXIO;
189 
190 	/* Just take a ref now. Not all opens result in a context assign */
191 	kobject_get(&dd->kobj);
192 
193 	/* The real work is performed later in assign_ctxt() */
194 
195 	fd = kzalloc(sizeof(*fd), GFP_KERNEL);
196 
197 	if (fd) {
198 		fd->rec_cpu_num = -1; /* no cpu affinity by default */
199 		fd->mm = current->mm;
200 		mmgrab(fd->mm);
201 		fd->dd = dd;
202 		fp->private_data = fd;
203 	} else {
204 		fp->private_data = NULL;
205 
206 		if (atomic_dec_and_test(&dd->user_refcount))
207 			complete(&dd->user_comp);
208 
209 		return -ENOMEM;
210 	}
211 
212 	return 0;
213 }
214 
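/*
 * A minimal sketch of the expected user-space calling convention,
 * inferred from the handlers below (illustrative only; the
 * authoritative definitions live in the hfi1 uAPI header). Most
 * commands pass a pointer through 'arg', e.g.:
 *
 *	struct hfi1_user_info uinfo = { .userversion = HFI1_USER_SWVERSION };
 *	int enable = 1;
 *
 *	ioctl(fd, HFI1_IOCTL_ASSIGN_CTXT, &uinfo);
 *	ioctl(fd, HFI1_IOCTL_RECV_CTRL, &enable);
 *
 * Only HFI1_IOCTL_ASSIGN_CTXT and HFI1_IOCTL_GET_VERS are accepted
 * before a context has been assigned.
 */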
215 static long hfi1_file_ioctl(struct file *fp, unsigned int cmd,
216 			    unsigned long arg)
217 {
218 	struct hfi1_filedata *fd = fp->private_data;
219 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
220 	struct hfi1_user_info uinfo;
221 	struct hfi1_tid_info tinfo;
222 	int ret = 0;
223 	unsigned long addr;
224 	int uval = 0;
225 	unsigned long ul_uval = 0;
226 	u16 uval16 = 0;
227 
228 	hfi1_cdbg(IOCTL, "IOCTL recv: 0x%x", cmd);
229 	if (cmd != HFI1_IOCTL_ASSIGN_CTXT &&
230 	    cmd != HFI1_IOCTL_GET_VERS &&
231 	    !uctxt)
232 		return -EINVAL;
233 
234 	switch (cmd) {
235 	case HFI1_IOCTL_ASSIGN_CTXT:
236 		if (uctxt)
237 			return -EINVAL;
238 
239 		if (copy_from_user(&uinfo,
240 				   (struct hfi1_user_info __user *)arg,
241 				   sizeof(uinfo)))
242 			return -EFAULT;
243 
244 		ret = assign_ctxt(fd, &uinfo);
245 		break;
246 	case HFI1_IOCTL_CTXT_INFO:
247 		ret = get_ctxt_info(fd, (void __user *)(unsigned long)arg,
248 				    sizeof(struct hfi1_ctxt_info));
249 		break;
250 	case HFI1_IOCTL_USER_INFO:
251 		ret = get_base_info(fd, (void __user *)(unsigned long)arg,
252 				    sizeof(struct hfi1_base_info));
253 		break;
254 	case HFI1_IOCTL_CREDIT_UPD:
255 		if (uctxt)
256 			sc_return_credits(uctxt->sc);
257 		break;
258 
259 	case HFI1_IOCTL_TID_UPDATE:
260 		if (copy_from_user(&tinfo,
261 				   (struct hfi1_tid_info __user *)arg,
262 				   sizeof(tinfo)))
263 			return -EFAULT;
264 
265 		ret = hfi1_user_exp_rcv_setup(fd, &tinfo);
266 		if (!ret) {
267 			/*
268 			 * Copy the number of tidlist entries we used
269 			 * and the length of the buffer we registered.
270 			 * These fields are adjacent in the structure so
271 			 * we can copy them at the same time.
272 			 */
273 			addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
274 			if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
275 					 sizeof(tinfo.tidcnt) +
276 					 sizeof(tinfo.length)))
277 				ret = -EFAULT;
278 		}
279 		break;
280 
281 	case HFI1_IOCTL_TID_FREE:
282 		if (copy_from_user(&tinfo,
283 				   (struct hfi1_tid_info __user *)arg,
284 				   sizeof(tinfo)))
285 			return -EFAULT;
286 
287 		ret = hfi1_user_exp_rcv_clear(fd, &tinfo);
288 		if (ret)
289 			break;
290 		addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
291 		if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
292 				 sizeof(tinfo.tidcnt)))
293 			ret = -EFAULT;
294 		break;
295 
296 	case HFI1_IOCTL_TID_INVAL_READ:
297 		if (copy_from_user(&tinfo,
298 				   (struct hfi1_tid_info __user *)arg,
299 				   sizeof(tinfo)))
300 			return -EFAULT;
301 
302 		ret = hfi1_user_exp_rcv_invalid(fd, &tinfo);
303 		if (ret)
304 			break;
305 		addr = arg + offsetof(struct hfi1_tid_info, tidcnt);
306 		if (copy_to_user((void __user *)addr, &tinfo.tidcnt,
307 				 sizeof(tinfo.tidcnt)))
308 			ret = -EFAULT;
309 		break;
310 
311 	case HFI1_IOCTL_RECV_CTRL:
312 		ret = get_user(uval, (int __user *)arg);
313 		if (ret != 0)
314 			return -EFAULT;
315 		ret = manage_rcvq(uctxt, fd->subctxt, uval);
316 		break;
317 
318 	case HFI1_IOCTL_POLL_TYPE:
319 		ret = get_user(uval, (int __user *)arg);
320 		if (ret != 0)
321 			return -EFAULT;
322 		uctxt->poll_type = (typeof(uctxt->poll_type))uval;
323 		break;
324 
325 	case HFI1_IOCTL_ACK_EVENT:
326 		ret = get_user(ul_uval, (unsigned long __user *)arg);
327 		if (ret != 0)
328 			return -EFAULT;
329 		ret = user_event_ack(uctxt, fd->subctxt, ul_uval);
330 		break;
331 
332 	case HFI1_IOCTL_SET_PKEY:
333 		ret = get_user(uval16, (u16 __user *)arg);
334 		if (ret != 0)
335 			return -EFAULT;
336 		if (HFI1_CAP_IS_USET(PKEY_CHECK))
337 			ret = set_ctxt_pkey(uctxt, fd->subctxt, uval16);
338 		else
339 			return -EPERM;
340 		break;
341 
342 	case HFI1_IOCTL_CTXT_RESET: {
343 		struct send_context *sc;
344 		struct hfi1_devdata *dd;
345 
346 		if (!uctxt || !uctxt->dd || !uctxt->sc)
347 			return -EINVAL;
348 
349 		/*
350 		 * There is no protection here. User level has to
351 		 * guarantee that no one will be writing to the send
352 		 * context while it is being re-initialized.
353 		 * If user level breaks that guarantee, it will break
354 		 * its own context and no one else's.
355 		 */
356 		dd = uctxt->dd;
357 		sc = uctxt->sc;
358 		/*
359 		 * Wait until the interrupt handler has marked the
360 		 * context as halted or frozen. Report error if we time
361 		 * out.
362 		 */
363 		wait_event_interruptible_timeout(
364 			sc->halt_wait, (sc->flags & SCF_HALTED),
365 			msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
366 		if (!(sc->flags & SCF_HALTED))
367 			return -ENOLCK;
368 
369 		/*
370 		 * If the send context was halted due to a Freeze,
371 		 * wait until the device has been "unfrozen" before
372 		 * resetting the context.
373 		 */
374 		if (sc->flags & SCF_FROZEN) {
375 			wait_event_interruptible_timeout(
376 				dd->event_queue,
377 				!(ACCESS_ONCE(dd->flags) & HFI1_FROZEN),
378 				msecs_to_jiffies(SEND_CTXT_HALT_TIMEOUT));
379 			if (dd->flags & HFI1_FROZEN)
380 				return -ENOLCK;
381 
382 			if (dd->flags & HFI1_FORCED_FREEZE)
383 				/*
384 				 * Don't allow context reset if we are in
385 				 * forced freeze
386 				 */
387 				return -ENODEV;
388 
389 			sc_disable(sc);
390 			ret = sc_enable(sc);
391 			hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_ENB,
392 				     uctxt->ctxt);
393 		} else {
394 			ret = sc_restart(sc);
395 		}
396 		if (!ret)
397 			sc_return_credits(sc);
398 		break;
399 	}
400 
401 	case HFI1_IOCTL_GET_VERS:
402 		uval = HFI1_USER_SWVERSION;
403 		if (put_user(uval, (int __user *)arg))
404 			return -EFAULT;
405 		break;
406 
407 	default:
408 		return -EINVAL;
409 	}
410 
411 	return ret;
412 }
413 
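/*
 * A rough sketch of how user space drives the SDMA path (an assumption
 * based on this handler, not a uAPI statement): requests are submitted
 * with writev(2) on the context fd, each request consuming one or more
 * iovecs, and completions are observed through the SDMA_COMP ring that
 * get_base_info() exposes for mmap():
 *
 *	struct iovec iov[2] = { { req_hdr, hdr_len }, { payload, pay_len } };
 *	ssize_t reqs = writev(fd, iov, 2);	// number of requests queued
 *
 * The request/iovec layout itself is owned by the user SDMA engine
 * (user_sdma.c), not by this file.
 */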
414 static ssize_t hfi1_write_iter(struct kiocb *kiocb, struct iov_iter *from)
415 {
416 	struct hfi1_filedata *fd = kiocb->ki_filp->private_data;
417 	struct hfi1_user_sdma_pkt_q *pq = fd->pq;
418 	struct hfi1_user_sdma_comp_q *cq = fd->cq;
419 	int done = 0, reqs = 0;
420 	unsigned long dim = from->nr_segs;
421 
422 	if (!cq || !pq)
423 		return -EIO;
424 
425 	if (!iter_is_iovec(from) || !dim)
426 		return -EINVAL;
427 
428 	hfi1_cdbg(SDMA, "SDMA request from %u:%u (%lu)",
429 		  fd->uctxt->ctxt, fd->subctxt, dim);
430 
431 	if (atomic_read(&pq->n_reqs) == pq->n_max_reqs)
432 		return -ENOSPC;
433 
434 	while (dim) {
435 		int ret;
436 		unsigned long count = 0;
437 
438 		ret = hfi1_user_sdma_process_request(
439 			fd, (struct iovec *)(from->iov + done),
440 			dim, &count);
441 		if (ret) {
442 			reqs = ret;
443 			break;
444 		}
445 		dim -= count;
446 		done += count;
447 		reqs++;
448 	}
449 
450 	return reqs;
451 }
452 
453 static int hfi1_file_mmap(struct file *fp, struct vm_area_struct *vma)
454 {
455 	struct hfi1_filedata *fd = fp->private_data;
456 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
457 	struct hfi1_devdata *dd;
458 	unsigned long flags;
459 	u64 token = vma->vm_pgoff << PAGE_SHIFT,
460 		memaddr = 0;
461 	void *memvirt = NULL;
462 	u8 subctxt, mapio = 0, vmf = 0, type;
463 	ssize_t memlen = 0;
464 	int ret = 0;
465 	u16 ctxt;
466 
467 	if (!is_valid_mmap(token) || !uctxt ||
468 	    !(vma->vm_flags & VM_SHARED)) {
469 		ret = -EINVAL;
470 		goto done;
471 	}
472 	dd = uctxt->dd;
473 	ctxt = HFI1_MMAP_TOKEN_GET(CTXT, token);
474 	subctxt = HFI1_MMAP_TOKEN_GET(SUBCTXT, token);
475 	type = HFI1_MMAP_TOKEN_GET(TYPE, token);
476 	if (ctxt != uctxt->ctxt || subctxt != fd->subctxt) {
477 		ret = -EINVAL;
478 		goto done;
479 	}
480 
481 	flags = vma->vm_flags;
482 
483 	switch (type) {
484 	case PIO_BUFS:
485 	case PIO_BUFS_SOP:
486 		memaddr = ((dd->physaddr + TXE_PIO_SEND) +
487 				/* chip pio base */
488 			   (uctxt->sc->hw_context * BIT(16))) +
489 				/* 64K PIO space / ctxt */
490 			(type == PIO_BUFS_SOP ?
491 				(TXE_PIO_SIZE / 2) : 0); /* sop? */
492 		/*
493 		 * Map only the amount allocated to the context, not the
494 		 * entire available context's PIO space.
495 		 */
496 		memlen = PAGE_ALIGN(uctxt->sc->credits * PIO_BLOCK_SIZE);
497 		flags &= ~VM_MAYREAD;
498 		flags |= VM_DONTCOPY | VM_DONTEXPAND;
499 		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
500 		mapio = 1;
501 		break;
502 	case PIO_CRED:
503 		if (flags & VM_WRITE) {
504 			ret = -EPERM;
505 			goto done;
506 		}
507 		/*
508 		 * The credit return location for this context could be on the
509 		 * second or third page allocated for credit returns (if the
510 		 * number of enabled contexts exceeds 64 or 128, respectively).
511 		 */
512 		memvirt = dd->cr_base[uctxt->numa_id].va;
513 		memaddr = virt_to_phys(memvirt) +
514 			(((u64)uctxt->sc->hw_free -
515 			  (u64)dd->cr_base[uctxt->numa_id].va) & PAGE_MASK);
516 		memlen = PAGE_SIZE;
517 		flags &= ~VM_MAYWRITE;
518 		flags |= VM_DONTCOPY | VM_DONTEXPAND;
519 		/*
520 		 * The driver has already allocated memory for credit
521 		 * returns and programmed it into the chip. Has that
522 		 * memory been flagged as non-cached?
523 		 */
524 		/* vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); */
525 		mapio = 1;
526 		break;
527 	case RCV_HDRQ:
528 		memlen = uctxt->rcvhdrq_size;
529 		memvirt = uctxt->rcvhdrq;
530 		break;
531 	case RCV_EGRBUF: {
532 		unsigned long addr;
533 		int i;
534 		/*
535 		 * The RcvEgr buffers need to be handled differently
536 		 * as multiple non-contiguous pages need to be mapped
537 		 * into the user process.
538 		 */
539 		memlen = uctxt->egrbufs.size;
540 		if ((vma->vm_end - vma->vm_start) != memlen) {
541 			dd_dev_err(dd, "Eager buffer map size invalid (%lu != %lu)\n",
542 				   (vma->vm_end - vma->vm_start), memlen);
543 			ret = -EINVAL;
544 			goto done;
545 		}
546 		if (vma->vm_flags & VM_WRITE) {
547 			ret = -EPERM;
548 			goto done;
549 		}
550 		vma->vm_flags &= ~VM_MAYWRITE;
551 		addr = vma->vm_start;
552 		for (i = 0; i < uctxt->egrbufs.numbufs; i++) {
553 			memlen = uctxt->egrbufs.buffers[i].len;
554 			memvirt = uctxt->egrbufs.buffers[i].addr;
555 			ret = remap_pfn_range(
556 				vma, addr,
557 				/*
558 				 * virt_to_pfn() does the same, but
559 				 * it's not available on x86_64
560 				 * when CONFIG_MMU is enabled.
561 				 */
562 				PFN_DOWN(__pa(memvirt)),
563 				memlen,
564 				vma->vm_page_prot);
565 			if (ret < 0)
566 				goto done;
567 			addr += memlen;
568 		}
569 		ret = 0;
570 		goto done;
571 	}
572 	case UREGS:
573 		/*
574 		 * Map only the page that contains this context's user
575 		 * registers.
576 		 */
577 		memaddr = (unsigned long)
578 			(dd->physaddr + RXE_PER_CONTEXT_USER)
579 			+ (uctxt->ctxt * RXE_PER_CONTEXT_SIZE);
580 		/*
581 		 * TidFlow table is on the same page as the rest of the
582 		 * user registers.
583 		 */
584 		memlen = PAGE_SIZE;
585 		flags |= VM_DONTCOPY | VM_DONTEXPAND;
586 		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
587 		mapio = 1;
588 		break;
589 	case EVENTS:
590 		/*
591 		 * Use the page where this context's flags are. User level
592 		 * knows where its own bitmap is within the page.
593 		 */
594 		memaddr = (unsigned long)(dd->events +
595 				  ((uctxt->ctxt - dd->first_dyn_alloc_ctxt) *
596 				   HFI1_MAX_SHARED_CTXTS)) & PAGE_MASK;
597 		memlen = PAGE_SIZE;
598 		/*
599 		 * v3.7 removes VM_RESERVED but the effect is kept by
600 		 * using VM_IO.
601 		 */
602 		flags |= VM_IO | VM_DONTEXPAND;
603 		vmf = 1;
604 		break;
605 	case STATUS:
606 		if (flags & (unsigned long)(VM_WRITE | VM_EXEC)) {
607 			ret = -EPERM;
608 			goto done;
609 		}
610 		memaddr = kvirt_to_phys((void *)dd->status);
611 		memlen = PAGE_SIZE;
612 		flags |= VM_IO | VM_DONTEXPAND;
613 		break;
614 	case RTAIL:
615 		if (!HFI1_CAP_IS_USET(DMA_RTAIL)) {
616 			/*
617 			 * If the memory allocation failed, the context alloc
618 			 * also would have failed, so we would never get here
619 			 */
620 			ret = -EINVAL;
621 			goto done;
622 		}
623 		if (flags & VM_WRITE) {
624 			ret = -EPERM;
625 			goto done;
626 		}
627 		memlen = PAGE_SIZE;
628 		memvirt = (void *)uctxt->rcvhdrtail_kvaddr;
629 		flags &= ~VM_MAYWRITE;
630 		break;
631 	case SUBCTXT_UREGS:
632 		memaddr = (u64)uctxt->subctxt_uregbase;
633 		memlen = PAGE_SIZE;
634 		flags |= VM_IO | VM_DONTEXPAND;
635 		vmf = 1;
636 		break;
637 	case SUBCTXT_RCV_HDRQ:
638 		memaddr = (u64)uctxt->subctxt_rcvhdr_base;
639 		memlen = uctxt->rcvhdrq_size * uctxt->subctxt_cnt;
640 		flags |= VM_IO | VM_DONTEXPAND;
641 		vmf = 1;
642 		break;
643 	case SUBCTXT_EGRBUF:
644 		memaddr = (u64)uctxt->subctxt_rcvegrbuf;
645 		memlen = uctxt->egrbufs.size * uctxt->subctxt_cnt;
646 		flags |= VM_IO | VM_DONTEXPAND;
647 		flags &= ~VM_MAYWRITE;
648 		vmf = 1;
649 		break;
650 	case SDMA_COMP: {
651 		struct hfi1_user_sdma_comp_q *cq = fd->cq;
652 
653 		if (!cq) {
654 			ret = -EFAULT;
655 			goto done;
656 		}
657 		memaddr = (u64)cq->comps;
658 		memlen = PAGE_ALIGN(sizeof(*cq->comps) * cq->nentries);
659 		flags |= VM_IO | VM_DONTEXPAND;
660 		vmf = 1;
661 		break;
662 	}
663 	default:
664 		ret = -EINVAL;
665 		break;
666 	}
667 
668 	if ((vma->vm_end - vma->vm_start) != memlen) {
669 		hfi1_cdbg(PROC, "%u:%u Memory size mismatch %lu:%lu",
670 			  uctxt->ctxt, fd->subctxt,
671 			  (vma->vm_end - vma->vm_start), memlen);
672 		ret = -EINVAL;
673 		goto done;
674 	}
675 
676 	vma->vm_flags = flags;
677 	hfi1_cdbg(PROC,
678 		  "%u:%u type:%u io/vf:%d/%d, addr:0x%llx, len:%lu(%lu), flags:0x%lx\n",
679 		    ctxt, subctxt, type, mapio, vmf, memaddr, memlen,
680 		    vma->vm_end - vma->vm_start, vma->vm_flags);
681 	if (vmf) {
682 		vma->vm_pgoff = PFN_DOWN(memaddr);
683 		vma->vm_ops = &vm_ops;
684 		ret = 0;
685 	} else if (mapio) {
686 		ret = io_remap_pfn_range(vma, vma->vm_start,
687 					 PFN_DOWN(memaddr),
688 					 memlen,
689 					 vma->vm_page_prot);
690 	} else if (memvirt) {
691 		ret = remap_pfn_range(vma, vma->vm_start,
692 				      PFN_DOWN(__pa(memvirt)),
693 				      memlen,
694 				      vma->vm_page_prot);
695 	} else {
696 		ret = remap_pfn_range(vma, vma->vm_start,
697 				      PFN_DOWN(memaddr),
698 				      memlen,
699 				      vma->vm_page_prot);
700 	}
701 done:
702 	return ret;
703 }
704 
705 /*
706  * Local (non-chip) user memory is not mapped right away but as it is
707  * accessed by the user-level code.
708  */
709 static int vma_fault(struct vm_fault *vmf)
710 {
711 	struct page *page;
712 
713 	page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
714 	if (!page)
715 		return VM_FAULT_SIGBUS;
716 
717 	get_page(page);
718 	vmf->page = page;
719 
720 	return 0;
721 }
722 
723 static unsigned int hfi1_poll(struct file *fp, struct poll_table_struct *pt)
724 {
725 	struct hfi1_ctxtdata *uctxt;
726 	unsigned pollflag;
727 
728 	uctxt = ((struct hfi1_filedata *)fp->private_data)->uctxt;
729 	if (!uctxt)
730 		pollflag = POLLERR;
731 	else if (uctxt->poll_type == HFI1_POLL_TYPE_URGENT)
732 		pollflag = poll_urgent(fp, pt);
733 	else  if (uctxt->poll_type == HFI1_POLL_TYPE_ANYRCV)
734 		pollflag = poll_next(fp, pt);
735 	else /* invalid */
736 		pollflag = POLLERR;
737 
738 	return pollflag;
739 }
740 
741 static int hfi1_file_close(struct inode *inode, struct file *fp)
742 {
743 	struct hfi1_filedata *fdata = fp->private_data;
744 	struct hfi1_ctxtdata *uctxt = fdata->uctxt;
745 	struct hfi1_devdata *dd = container_of(inode->i_cdev,
746 					       struct hfi1_devdata,
747 					       user_cdev);
748 	unsigned long flags, *ev;
749 
750 	fp->private_data = NULL;
751 
752 	if (!uctxt)
753 		goto done;
754 
755 	hfi1_cdbg(PROC, "freeing ctxt %u:%u", uctxt->ctxt, fdata->subctxt);
756 	mutex_lock(&hfi1_mutex);
757 
758 	flush_wc();
759 	/* drain user sdma queue */
760 	hfi1_user_sdma_free_queues(fdata);
761 
762 	/* release the cpu */
763 	hfi1_put_proc_affinity(fdata->rec_cpu_num);
764 
765 	/* clean up rcv side */
766 	hfi1_user_exp_rcv_free(fdata);
767 
768 	/*
769 	 * Clear any left over, unhandled events so the next process that
770 	 * gets this context doesn't get confused.
771 	 */
772 	ev = dd->events + ((uctxt->ctxt - dd->first_dyn_alloc_ctxt) *
773 			   HFI1_MAX_SHARED_CTXTS) + fdata->subctxt;
774 	*ev = 0;
775 
776 	__clear_bit(fdata->subctxt, uctxt->in_use_ctxts);
777 	if (!bitmap_empty(uctxt->in_use_ctxts, HFI1_MAX_SHARED_CTXTS)) {
778 		mutex_unlock(&hfi1_mutex);
779 		goto done;
780 	}
781 
782 	spin_lock_irqsave(&dd->uctxt_lock, flags);
783 	/*
784 	 * Disable receive context and interrupt available, reset all
785 	 * RcvCtxtCtrl bits to default values.
786 	 */
787 	hfi1_rcvctrl(dd, HFI1_RCVCTRL_CTXT_DIS |
788 		     HFI1_RCVCTRL_TIDFLOW_DIS |
789 		     HFI1_RCVCTRL_INTRAVAIL_DIS |
790 		     HFI1_RCVCTRL_TAILUPD_DIS |
791 		     HFI1_RCVCTRL_ONE_PKT_EGR_DIS |
792 		     HFI1_RCVCTRL_NO_RHQ_DROP_DIS |
793 		     HFI1_RCVCTRL_NO_EGR_DROP_DIS, uctxt->ctxt);
794 	/* Clear the context's J_KEY */
795 	hfi1_clear_ctxt_jkey(dd, uctxt->ctxt);
796 	/*
797 	 * Reset context integrity checks to default.
798 	 * (writes to CSRs probably belong in chip.c)
799 	 */
800 	write_kctxt_csr(dd, uctxt->sc->hw_context, SEND_CTXT_CHECK_ENABLE,
801 			hfi1_pkt_default_send_ctxt_mask(dd, uctxt->sc->type));
802 	sc_disable(uctxt->sc);
803 	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
804 
805 	dd->rcd[uctxt->ctxt] = NULL;
806 
807 	hfi1_user_exp_rcv_grp_free(uctxt);
808 	hfi1_clear_ctxt_pkey(dd, uctxt);
809 
810 	uctxt->rcvwait_to = 0;
811 	uctxt->piowait_to = 0;
812 	uctxt->rcvnowait = 0;
813 	uctxt->pionowait = 0;
814 	uctxt->event_flags = 0;
815 
816 	hfi1_stats.sps_ctxts--;
817 	if (++dd->freectxts == dd->num_user_contexts)
818 		aspm_enable_all(dd);
819 	mutex_unlock(&hfi1_mutex);
820 	hfi1_free_ctxtdata(dd, uctxt);
821 done:
822 	mmdrop(fdata->mm);
823 	kobject_put(&dd->kobj);
824 
825 	if (atomic_dec_and_test(&dd->user_refcount))
826 		complete(&dd->user_comp);
827 
828 	kfree(fdata);
829 	return 0;
830 }
831 
832 /*
833  * Convert kernel *virtual* addresses to physical addresses.
834  * This is used for vmalloc'ed addresses.
835  */
836 static u64 kvirt_to_phys(void *addr)
837 {
838 	struct page *page;
839 	u64 paddr = 0;
840 
841 	page = vmalloc_to_page(addr);
842 	if (page)
843 		paddr = page_to_pfn(page) << PAGE_SHIFT;
844 
845 	return paddr;
846 }
847 
848 static int assign_ctxt(struct hfi1_filedata *fd, struct hfi1_user_info *uinfo)
849 {
850 	int ret;
851 	unsigned int swmajor, swminor;
852 
853 	swmajor = uinfo->userversion >> 16;
854 	if (swmajor != HFI1_USER_SWMAJOR)
855 		return -ENODEV;
856 
857 	swminor = uinfo->userversion & 0xffff;
858 
859 	mutex_lock(&hfi1_mutex);
860 	/*
861 	 * Get a sub context if necessary.
862 	 * ret < 0 error, 0 no context, 1 sub-context found
863 	 */
864 	ret = 0;
865 	if (uinfo->subctxt_cnt) {
866 		ret = find_sub_ctxt(fd, uinfo);
867 		if (ret > 0)
868 			fd->rec_cpu_num =
869 				hfi1_get_proc_affinity(fd->uctxt->numa_id);
870 	}
871 
872 	/*
873 	 * Allocate a base context if context sharing is not required or we
874 	 * couldn't find a sub context.
875 	 */
876 	if (!ret)
877 		ret = allocate_ctxt(fd, fd->dd, uinfo);
878 
879 	mutex_unlock(&hfi1_mutex);
880 
881 	/* Depending on the context type, do the appropriate init */
882 	if (ret > 0) {
883 		/*
884 		 * sub-context info can only be set up after the base
885 		 * context has been completed.
886 		 */
887 		ret = wait_event_interruptible(fd->uctxt->wait, !test_bit(
888 					       HFI1_CTXT_BASE_UNINIT,
889 					       &fd->uctxt->event_flags));
890 		if (test_bit(HFI1_CTXT_BASE_FAILED, &fd->uctxt->event_flags)) {
891 			clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts);
892 			return -ENOMEM;
893 		}
894 		/* The only thing a sub context needs is the user_xxx stuff */
895 		if (!ret)
896 			ret = init_user_ctxt(fd);
897 
898 		if (ret)
899 			clear_bit(fd->subctxt, fd->uctxt->in_use_ctxts);
900 	} else if (!ret) {
901 		ret = setup_base_ctxt(fd);
902 		if (fd->uctxt->subctxt_cnt) {
903 			/* If there is an error, set the failed bit. */
904 			if (ret)
905 				set_bit(HFI1_CTXT_BASE_FAILED,
906 					&fd->uctxt->event_flags);
907 			/*
908 			 * Base context is done, notify anybody using a
909 			 * sub-context that is waiting for this completion
910 			 */
911 			clear_bit(HFI1_CTXT_BASE_UNINIT,
912 				  &fd->uctxt->event_flags);
913 			wake_up(&fd->uctxt->wait);
914 		}
915 	}
916 
917 	return ret;
918 }
919 
920 /*
921  * The hfi1_mutex must be held when this function is called.  It is
922  * necessary to ensure serialized access to the bitmask in_use_ctxts.
923  */
924 static int find_sub_ctxt(struct hfi1_filedata *fd,
925 			 const struct hfi1_user_info *uinfo)
926 {
927 	int i;
928 	struct hfi1_devdata *dd = fd->dd;
929 	u16 subctxt;
930 
931 	for (i = dd->first_dyn_alloc_ctxt; i < dd->num_rcv_contexts; i++) {
932 		struct hfi1_ctxtdata *uctxt = dd->rcd[i];
933 
934 		/* Skip ctxts which are not yet open */
935 		if (!uctxt ||
936 		    bitmap_empty(uctxt->in_use_ctxts,
937 				 HFI1_MAX_SHARED_CTXTS))
938 			continue;
939 
940 		/* Skip dynamically allocated kernel contexts */
941 		if (uctxt->sc && (uctxt->sc->type == SC_KERNEL))
942 			continue;
943 
944 		/* Skip ctxt if it doesn't match the requested one */
945 		if (memcmp(uctxt->uuid, uinfo->uuid,
946 			   sizeof(uctxt->uuid)) ||
947 		    uctxt->jkey != generate_jkey(current_uid()) ||
948 		    uctxt->subctxt_id != uinfo->subctxt_id ||
949 		    uctxt->subctxt_cnt != uinfo->subctxt_cnt)
950 			continue;
951 
952 		/* Verify the sharing process matches the master */
953 		if (uctxt->userversion != uinfo->userversion)
954 			return -EINVAL;
955 
956 		/* Find an unused context */
957 		subctxt = find_first_zero_bit(uctxt->in_use_ctxts,
958 					      HFI1_MAX_SHARED_CTXTS);
959 		if (subctxt >= uctxt->subctxt_cnt)
960 			return -EBUSY;
961 
962 		fd->uctxt = uctxt;
963 		fd->subctxt = subctxt;
964 		__set_bit(fd->subctxt, uctxt->in_use_ctxts);
965 
966 		return 1;
967 	}
968 
969 	return 0;
970 }
971 
972 static int allocate_ctxt(struct hfi1_filedata *fd, struct hfi1_devdata *dd,
973 			 struct hfi1_user_info *uinfo)
974 {
975 	struct hfi1_ctxtdata *uctxt;
976 	unsigned int ctxt;
977 	int ret, numa;
978 
979 	if (dd->flags & HFI1_FROZEN) {
980 		/*
981 		 * Pick an error that is unique from all other errors
982 		 * that are returned so the user process knows that
983 		 * it tried to allocate while the SPC was frozen.  It
984 		 * should be able to retry with success in a short
985 		 * while.
986 		 */
987 		return -EIO;
988 	}
989 
990 	/*
991 	 * This check is sort of redundant to the next EBUSY error. It would
992 	 * also indicate an inconsistency in the driver if this value was
993 	 * zero, but there were still contexts available.
994 	 */
995 	if (!dd->freectxts)
996 		return -EBUSY;
997 
998 	for (ctxt = dd->first_dyn_alloc_ctxt;
999 	     ctxt < dd->num_rcv_contexts; ctxt++)
1000 		if (!dd->rcd[ctxt])
1001 			break;
1002 
1003 	if (ctxt == dd->num_rcv_contexts)
1004 		return -EBUSY;
1005 
1006 	/*
1007 	 * If we don't have a NUMA node requested, preference is towards
1008 	 * the device's NUMA node.
1009 	 */
1010 	fd->rec_cpu_num = hfi1_get_proc_affinity(dd->node);
1011 	if (fd->rec_cpu_num != -1)
1012 		numa = cpu_to_node(fd->rec_cpu_num);
1013 	else
1014 		numa = numa_node_id();
1015 	uctxt = hfi1_create_ctxtdata(dd->pport, ctxt, numa);
1016 	if (!uctxt) {
1017 		dd_dev_err(dd,
1018 			   "Unable to allocate ctxtdata memory, failing open\n");
1019 		return -ENOMEM;
1020 	}
1021 	hfi1_cdbg(PROC, "[%u:%u] pid %u assigned to CPU %d (NUMA %u)",
1022 		  uctxt->ctxt, fd->subctxt, current->pid, fd->rec_cpu_num,
1023 		  uctxt->numa_id);
1024 
1025 	/*
1026 	 * Allocate and enable a PIO send context.
1027 	 */
1028 	uctxt->sc = sc_alloc(dd, SC_USER, uctxt->rcvhdrqentsize,
1029 			     uctxt->dd->node);
1030 	if (!uctxt->sc) {
1031 		ret = -ENOMEM;
1032 		goto ctxdata_free;
1033 	}
1034 	hfi1_cdbg(PROC, "allocated send context %u(%u)\n", uctxt->sc->sw_index,
1035 		  uctxt->sc->hw_context);
1036 	ret = sc_enable(uctxt->sc);
1037 	if (ret)
1038 		goto ctxdata_free;
1039 
1040 	/*
1041 	 * Setup sub context resources if the user-level has requested
1042 	 * sub contexts.
1043 	 * This has to be done here so the rest of the sub-contexts find the
1044 	 * proper master.
1045 	 */
1046 	if (uinfo->subctxt_cnt) {
1047 		ret = init_subctxts(uctxt, uinfo);
1048 		/*
1049 		 * On error, we don't need to disable and de-allocate the
1050 		 * send context because it will be done during file close
1051 		 */
1052 		if (ret)
1053 			goto ctxdata_free;
1054 	}
1055 	uctxt->userversion = uinfo->userversion;
1056 	uctxt->flags = hfi1_cap_mask; /* save current flag state */
1057 	init_waitqueue_head(&uctxt->wait);
1058 	strlcpy(uctxt->comm, current->comm, sizeof(uctxt->comm));
1059 	memcpy(uctxt->uuid, uinfo->uuid, sizeof(uctxt->uuid));
1060 	uctxt->jkey = generate_jkey(current_uid());
1061 	INIT_LIST_HEAD(&uctxt->sdma_queues);
1062 	spin_lock_init(&uctxt->sdma_qlock);
1063 	hfi1_stats.sps_ctxts++;
1064 	/*
1065 	 * Disable ASPM when there are open user/PSM contexts to avoid
1066 	 * issues with ASPM L1 exit latency
1067 	 */
1068 	if (dd->freectxts-- == dd->num_user_contexts)
1069 		aspm_disable_all(dd);
1070 	fd->uctxt = uctxt;
1071 
1072 	return 0;
1073 
1074 ctxdata_free:
1075 	dd->rcd[ctxt] = NULL;
1076 	hfi1_free_ctxtdata(dd, uctxt);
1077 	return ret;
1078 }
1079 
1080 static int init_subctxts(struct hfi1_ctxtdata *uctxt,
1081 			 const struct hfi1_user_info *uinfo)
1082 {
1083 	u16 num_subctxts;
1084 
1085 	num_subctxts = uinfo->subctxt_cnt;
1086 	if (num_subctxts > HFI1_MAX_SHARED_CTXTS)
1087 		return -EINVAL;
1088 
1089 	uctxt->subctxt_cnt = uinfo->subctxt_cnt;
1090 	uctxt->subctxt_id = uinfo->subctxt_id;
1091 	uctxt->redirect_seq_cnt = 1;
1092 	set_bit(HFI1_CTXT_BASE_UNINIT, &uctxt->event_flags);
1093 
1094 	return 0;
1095 }
1096 
1097 static int setup_subctxt(struct hfi1_ctxtdata *uctxt)
1098 {
1099 	int ret = 0;
1100 	u16 num_subctxts = uctxt->subctxt_cnt;
1101 
1102 	uctxt->subctxt_uregbase = vmalloc_user(PAGE_SIZE);
1103 	if (!uctxt->subctxt_uregbase)
1104 		return -ENOMEM;
1105 
1106 	/* We can take the size of the RcvHdr Queue from the master */
1107 	uctxt->subctxt_rcvhdr_base = vmalloc_user(uctxt->rcvhdrq_size *
1108 						  num_subctxts);
1109 	if (!uctxt->subctxt_rcvhdr_base) {
1110 		ret = -ENOMEM;
1111 		goto bail_ureg;
1112 	}
1113 
1114 	uctxt->subctxt_rcvegrbuf = vmalloc_user(uctxt->egrbufs.size *
1115 						num_subctxts);
1116 	if (!uctxt->subctxt_rcvegrbuf) {
1117 		ret = -ENOMEM;
1118 		goto bail_rhdr;
1119 	}
1120 
1121 	return 0;
1122 
1123 bail_rhdr:
1124 	vfree(uctxt->subctxt_rcvhdr_base);
1125 	uctxt->subctxt_rcvhdr_base = NULL;
1126 bail_ureg:
1127 	vfree(uctxt->subctxt_uregbase);
1128 	uctxt->subctxt_uregbase = NULL;
1129 
1130 	return ret;
1131 }
1132 
1133 static void user_init(struct hfi1_ctxtdata *uctxt)
1134 {
1135 	unsigned int rcvctrl_ops = 0;
1136 
1137 	/* initialize poll variables... */
1138 	uctxt->urgent = 0;
1139 	uctxt->urgent_poll = 0;
1140 
1141 	/*
1142 	 * Now enable the ctxt for receive.
1143 	 * For chips that are set to DMA the tail register to memory
1144 	 * when it changes (and when the update bit transitions from
1145 	 * 0 to 1), we turn it off and then back on.
1146 	 * This will (very briefly) affect any other open ctxts, but the
1147 	 * duration is very short, and therefore isn't an issue.  We
1148 	 * explicitly set the in-memory tail copy to 0 beforehand, so we
1149 	 * don't have to wait to be sure the DMA update has happened
1150 	 * (chip resets head/tail to 0 on transition to enable).
1151 	 */
1152 	if (uctxt->rcvhdrtail_kvaddr)
1153 		clear_rcvhdrtail(uctxt);
1154 
1155 	/* Setup J_KEY before enabling the context */
1156 	hfi1_set_ctxt_jkey(uctxt->dd, uctxt->ctxt, uctxt->jkey);
1157 
1158 	rcvctrl_ops = HFI1_RCVCTRL_CTXT_ENB;
1159 	if (HFI1_CAP_UGET_MASK(uctxt->flags, HDRSUPP))
1160 		rcvctrl_ops |= HFI1_RCVCTRL_TIDFLOW_ENB;
1161 	/*
1162 	 * Ignore the bit in the flags for now until proper
1163 	 * support for multiple packets per rcv array entry is
1164 	 * added.
1165 	 */
1166 	if (!HFI1_CAP_UGET_MASK(uctxt->flags, MULTI_PKT_EGR))
1167 		rcvctrl_ops |= HFI1_RCVCTRL_ONE_PKT_EGR_ENB;
1168 	if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_EGR_FULL))
1169 		rcvctrl_ops |= HFI1_RCVCTRL_NO_EGR_DROP_ENB;
1170 	if (HFI1_CAP_UGET_MASK(uctxt->flags, NODROP_RHQ_FULL))
1171 		rcvctrl_ops |= HFI1_RCVCTRL_NO_RHQ_DROP_ENB;
1172 	/*
1173 	 * The RcvCtxtCtrl.TailUpd bit has to be explicitly written.
1174 	 * We can't rely on the correct value to be set from prior
1175 	 * uses of the chip or ctxt. Therefore, add the rcvctrl op
1176 	 * for both cases.
1177 	 */
1178 	if (HFI1_CAP_UGET_MASK(uctxt->flags, DMA_RTAIL))
1179 		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_ENB;
1180 	else
1181 		rcvctrl_ops |= HFI1_RCVCTRL_TAILUPD_DIS;
1182 	hfi1_rcvctrl(uctxt->dd, rcvctrl_ops, uctxt->ctxt);
1183 }
1184 
1185 static int get_ctxt_info(struct hfi1_filedata *fd, void __user *ubase,
1186 			 __u32 len)
1187 {
1188 	struct hfi1_ctxt_info cinfo;
1189 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
1190 	int ret = 0;
1191 
1192 	memset(&cinfo, 0, sizeof(cinfo));
1193 	cinfo.runtime_flags = (((uctxt->flags >> HFI1_CAP_MISC_SHIFT) &
1194 				HFI1_CAP_MISC_MASK) << HFI1_CAP_USER_SHIFT) |
1195 			HFI1_CAP_UGET_MASK(uctxt->flags, MASK) |
1196 			HFI1_CAP_KGET_MASK(uctxt->flags, K2U);
1197 	/* adjust flag if this fd is not able to cache */
1198 	if (!fd->handler)
1199 		cinfo.runtime_flags |= HFI1_CAP_TID_UNMAP; /* no caching */
1200 
1201 	cinfo.num_active = hfi1_count_active_units();
1202 	cinfo.unit = uctxt->dd->unit;
1203 	cinfo.ctxt = uctxt->ctxt;
1204 	cinfo.subctxt = fd->subctxt;
1205 	cinfo.rcvtids = roundup(uctxt->egrbufs.alloced,
1206 				uctxt->dd->rcv_entries.group_size) +
1207 		uctxt->expected_count;
1208 	cinfo.credits = uctxt->sc->credits;
1209 	cinfo.numa_node = uctxt->numa_id;
1210 	cinfo.rec_cpu = fd->rec_cpu_num;
1211 	cinfo.send_ctxt = uctxt->sc->hw_context;
1212 
1213 	cinfo.egrtids = uctxt->egrbufs.alloced;
1214 	cinfo.rcvhdrq_cnt = uctxt->rcvhdrq_cnt;
1215 	cinfo.rcvhdrq_entsize = uctxt->rcvhdrqentsize << 2;
1216 	cinfo.sdma_ring_size = fd->cq->nentries;
1217 	cinfo.rcvegr_size = uctxt->egrbufs.rcvtid_size;
1218 
1219 	trace_hfi1_ctxt_info(uctxt->dd, uctxt->ctxt, fd->subctxt, cinfo);
1220 	if (copy_to_user(ubase, &cinfo, sizeof(cinfo)))
1221 		ret = -EFAULT;
1222 
1223 	return ret;
1224 }
1225 
1226 static int init_user_ctxt(struct hfi1_filedata *fd)
1227 {
1228 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
1229 	int ret;
1230 
1231 	ret = hfi1_user_sdma_alloc_queues(uctxt, fd);
1232 	if (ret)
1233 		return ret;
1234 
1235 	ret = hfi1_user_exp_rcv_init(fd);
1236 
1237 	return ret;
1238 }
1239 
1240 static int setup_base_ctxt(struct hfi1_filedata *fd)
1241 {
1242 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
1243 	struct hfi1_devdata *dd = uctxt->dd;
1244 	int ret = 0;
1245 
1246 	hfi1_init_ctxt(uctxt->sc);
1247 
1248 	/* Now allocate the RcvHdr queue and eager buffers. */
1249 	ret = hfi1_create_rcvhdrq(dd, uctxt);
1250 	if (ret)
1251 		return ret;
1252 
1253 	ret = hfi1_setup_eagerbufs(uctxt);
1254 	if (ret)
1255 		goto setup_failed;
1256 
1257 	/* If sub-contexts are enabled, do the appropriate setup */
1258 	if (uctxt->subctxt_cnt)
1259 		ret = setup_subctxt(uctxt);
1260 	if (ret)
1261 		goto setup_failed;
1262 
1263 	ret = hfi1_user_exp_rcv_grp_init(fd);
1264 	if (ret)
1265 		goto setup_failed;
1266 
1267 	ret = init_user_ctxt(fd);
1268 	if (ret)
1269 		goto setup_failed;
1270 
1271 	user_init(uctxt);
1272 
1273 	return 0;
1274 
1275 setup_failed:
1276 	hfi1_free_ctxtdata(dd, uctxt);
1277 	return ret;
1278 }
1279 
1280 static int get_base_info(struct hfi1_filedata *fd, void __user *ubase,
1281 			 __u32 len)
1282 {
1283 	struct hfi1_base_info binfo;
1284 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
1285 	struct hfi1_devdata *dd = uctxt->dd;
1286 	ssize_t sz;
1287 	unsigned offset;
1288 	int ret = 0;
1289 
1290 	trace_hfi1_uctxtdata(uctxt->dd, uctxt, fd->subctxt);
1291 
1292 	memset(&binfo, 0, sizeof(binfo));
1293 	binfo.hw_version = dd->revision;
1294 	binfo.sw_version = HFI1_KERN_SWVERSION;
1295 	binfo.bthqp = kdeth_qp;
1296 	binfo.jkey = uctxt->jkey;
1297 	/*
1298 	 * If more than 64 contexts are enabled, the allocated credit
1299 	 * return will span two or three contiguous pages. Since we only
1300 	 * map the page containing the context's credit return address,
1301 	 * we need to calculate the offset in the proper page.
1302 	 */
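	/*
	 * Worked example (assuming 4 KiB pages): if this context's hw_free
	 * pointer sits 0x1840 bytes past cr_base, the credit return lives
	 * in the second credit-return page and the offset encoded in the
	 * token below is 0x1840 % 0x1000 = 0x840.
	 */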
1303 	offset = ((u64)uctxt->sc->hw_free -
1304 		  (u64)dd->cr_base[uctxt->numa_id].va) % PAGE_SIZE;
1305 	binfo.sc_credits_addr = HFI1_MMAP_TOKEN(PIO_CRED, uctxt->ctxt,
1306 						fd->subctxt, offset);
1307 	binfo.pio_bufbase = HFI1_MMAP_TOKEN(PIO_BUFS, uctxt->ctxt,
1308 					    fd->subctxt,
1309 					    uctxt->sc->base_addr);
1310 	binfo.pio_bufbase_sop = HFI1_MMAP_TOKEN(PIO_BUFS_SOP,
1311 						uctxt->ctxt,
1312 						fd->subctxt,
1313 						uctxt->sc->base_addr);
1314 	binfo.rcvhdr_bufbase = HFI1_MMAP_TOKEN(RCV_HDRQ, uctxt->ctxt,
1315 					       fd->subctxt,
1316 					       uctxt->rcvhdrq);
1317 	binfo.rcvegr_bufbase = HFI1_MMAP_TOKEN(RCV_EGRBUF, uctxt->ctxt,
1318 					       fd->subctxt,
1319 					       uctxt->egrbufs.rcvtids[0].dma);
1320 	binfo.sdma_comp_bufbase = HFI1_MMAP_TOKEN(SDMA_COMP, uctxt->ctxt,
1321 						 fd->subctxt, 0);
1322 	/*
1323 	 * user regs are at
1324 	 * (RXE_PER_CONTEXT_USER + (ctxt * RXE_PER_CONTEXT_SIZE))
1325 	 */
1326 	binfo.user_regbase = HFI1_MMAP_TOKEN(UREGS, uctxt->ctxt,
1327 					    fd->subctxt, 0);
1328 	offset = offset_in_page((((uctxt->ctxt - dd->first_dyn_alloc_ctxt) *
1329 		    HFI1_MAX_SHARED_CTXTS) + fd->subctxt) *
1330 		  sizeof(*dd->events));
1331 	binfo.events_bufbase = HFI1_MMAP_TOKEN(EVENTS, uctxt->ctxt,
1332 					      fd->subctxt,
1333 					      offset);
1334 	binfo.status_bufbase = HFI1_MMAP_TOKEN(STATUS, uctxt->ctxt,
1335 					      fd->subctxt,
1336 					      dd->status);
1337 	if (HFI1_CAP_IS_USET(DMA_RTAIL))
1338 		binfo.rcvhdrtail_base = HFI1_MMAP_TOKEN(RTAIL, uctxt->ctxt,
1339 						       fd->subctxt, 0);
1340 	if (uctxt->subctxt_cnt) {
1341 		binfo.subctxt_uregbase = HFI1_MMAP_TOKEN(SUBCTXT_UREGS,
1342 							uctxt->ctxt,
1343 							fd->subctxt, 0);
1344 		binfo.subctxt_rcvhdrbuf = HFI1_MMAP_TOKEN(SUBCTXT_RCV_HDRQ,
1345 							 uctxt->ctxt,
1346 							 fd->subctxt, 0);
1347 		binfo.subctxt_rcvegrbuf = HFI1_MMAP_TOKEN(SUBCTXT_EGRBUF,
1348 							 uctxt->ctxt,
1349 							 fd->subctxt, 0);
1350 	}
1351 	sz = (len < sizeof(binfo)) ? len : sizeof(binfo);
1352 	if (copy_to_user(ubase, &binfo, sz))
1353 		ret = -EFAULT;
1354 	return ret;
1355 }
1356 
1357 static unsigned int poll_urgent(struct file *fp,
1358 				struct poll_table_struct *pt)
1359 {
1360 	struct hfi1_filedata *fd = fp->private_data;
1361 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
1362 	struct hfi1_devdata *dd = uctxt->dd;
1363 	unsigned pollflag;
1364 
1365 	poll_wait(fp, &uctxt->wait, pt);
1366 
1367 	spin_lock_irq(&dd->uctxt_lock);
1368 	if (uctxt->urgent != uctxt->urgent_poll) {
1369 		pollflag = POLLIN | POLLRDNORM;
1370 		uctxt->urgent_poll = uctxt->urgent;
1371 	} else {
1372 		pollflag = 0;
1373 		set_bit(HFI1_CTXT_WAITING_URG, &uctxt->event_flags);
1374 	}
1375 	spin_unlock_irq(&dd->uctxt_lock);
1376 
1377 	return pollflag;
1378 }
1379 
1380 static unsigned int poll_next(struct file *fp,
1381 			      struct poll_table_struct *pt)
1382 {
1383 	struct hfi1_filedata *fd = fp->private_data;
1384 	struct hfi1_ctxtdata *uctxt = fd->uctxt;
1385 	struct hfi1_devdata *dd = uctxt->dd;
1386 	unsigned pollflag;
1387 
1388 	poll_wait(fp, &uctxt->wait, pt);
1389 
1390 	spin_lock_irq(&dd->uctxt_lock);
1391 	if (hdrqempty(uctxt)) {
1392 		set_bit(HFI1_CTXT_WAITING_RCV, &uctxt->event_flags);
1393 		hfi1_rcvctrl(dd, HFI1_RCVCTRL_INTRAVAIL_ENB, uctxt->ctxt);
1394 		pollflag = 0;
1395 	} else {
1396 		pollflag = POLLIN | POLLRDNORM;
1397 	}
1398 	spin_unlock_irq(&dd->uctxt_lock);
1399 
1400 	return pollflag;
1401 }
1402 
1403 /*
1404  * Find all user contexts in use, and set the specified bit in their
1405  * event mask.
1406  * See also find_ctxt() for a similar use that is specific to send buffers.
1407  */
1408 int hfi1_set_uevent_bits(struct hfi1_pportdata *ppd, const int evtbit)
1409 {
1410 	struct hfi1_ctxtdata *uctxt;
1411 	struct hfi1_devdata *dd = ppd->dd;
1412 	unsigned ctxt;
1413 	int ret = 0;
1414 	unsigned long flags;
1415 
1416 	if (!dd->events) {
1417 		ret = -EINVAL;
1418 		goto done;
1419 	}
1420 
1421 	spin_lock_irqsave(&dd->uctxt_lock, flags);
1422 	for (ctxt = dd->first_dyn_alloc_ctxt; ctxt < dd->num_rcv_contexts;
1423 	     ctxt++) {
1424 		uctxt = dd->rcd[ctxt];
1425 		if (uctxt) {
1426 			unsigned long *evs = dd->events +
1427 				(uctxt->ctxt - dd->first_dyn_alloc_ctxt) *
1428 				HFI1_MAX_SHARED_CTXTS;
1429 			int i;
1430 			/*
1431 			 * subctxt_cnt is 0 if not shared, so do base
1432 			 * separately, first, then remaining subctxt, if any
1433 			 */
1434 			set_bit(evtbit, evs);
1435 			for (i = 1; i < uctxt->subctxt_cnt; i++)
1436 				set_bit(evtbit, evs + i);
1437 		}
1438 	}
1439 	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
1440 done:
1441 	return ret;
1442 }
1443 
1444 /**
1445  * manage_rcvq - manage a context's receive queue
1446  * @uctxt: the context
1447  * @subctxt: the sub-context
1448  * @start_stop: action to carry out
1449  *
1450  * start_stop == 0 disables receive on the context, for use in queue
1451  * overflow conditions.  start_stop==1 re-enables, to be used to
1452  * re-init the software copy of the head register
1453  */
1454 static int manage_rcvq(struct hfi1_ctxtdata *uctxt, u16 subctxt,
1455 		       int start_stop)
1456 {
1457 	struct hfi1_devdata *dd = uctxt->dd;
1458 	unsigned int rcvctrl_op;
1459 
1460 	if (subctxt)
1461 		goto bail;
1462 	/* atomically clear receive enable ctxt. */
1463 	if (start_stop) {
1464 		/*
1465 		 * On enable, force in-memory copy of the tail register to
1466 		 * 0, so that protocol code doesn't have to worry about
1467 		 * whether or not the chip has yet updated the in-memory
1468 		 * copy on return from the system call. The chip
1469 		 * always resets its tail register back to 0 on a
1470 		 * transition from disabled to enabled.
1471 		 */
1472 		if (uctxt->rcvhdrtail_kvaddr)
1473 			clear_rcvhdrtail(uctxt);
1474 		rcvctrl_op = HFI1_RCVCTRL_CTXT_ENB;
1475 	} else {
1476 		rcvctrl_op = HFI1_RCVCTRL_CTXT_DIS;
1477 	}
1478 	hfi1_rcvctrl(dd, rcvctrl_op, uctxt->ctxt);
1479 	/* always; new head should be equal to new tail; see above */
1480 bail:
1481 	return 0;
1482 }
1483 
1484 /*
1485  * clear the event notifier events for this context.
1486  * The user process then performs actions appropriate to the bit having
1487  * been set, if desired, and checks again in the future.
1488  */
1489 static int user_event_ack(struct hfi1_ctxtdata *uctxt, u16 subctxt,
1490 			  unsigned long events)
1491 {
1492 	int i;
1493 	struct hfi1_devdata *dd = uctxt->dd;
1494 	unsigned long *evs;
1495 
1496 	if (!dd->events)
1497 		return 0;
1498 
1499 	evs = dd->events + ((uctxt->ctxt - dd->first_dyn_alloc_ctxt) *
1500 			    HFI1_MAX_SHARED_CTXTS) + subctxt;
1501 
1502 	for (i = 0; i <= _HFI1_MAX_EVENT_BIT; i++) {
1503 		if (!test_bit(i, &events))
1504 			continue;
1505 		clear_bit(i, evs);
1506 	}
1507 	return 0;
1508 }
1509 
1510 static int set_ctxt_pkey(struct hfi1_ctxtdata *uctxt, u16 subctxt, u16 pkey)
1511 {
1512 	int ret = -ENOENT, i, intable = 0;
1513 	struct hfi1_pportdata *ppd = uctxt->ppd;
1514 	struct hfi1_devdata *dd = uctxt->dd;
1515 
1516 	if (pkey == LIM_MGMT_P_KEY || pkey == FULL_MGMT_P_KEY) {
1517 		ret = -EINVAL;
1518 		goto done;
1519 	}
1520 
1521 	for (i = 0; i < ARRAY_SIZE(ppd->pkeys); i++)
1522 		if (pkey == ppd->pkeys[i]) {
1523 			intable = 1;
1524 			break;
1525 		}
1526 
1527 	if (intable)
1528 		ret = hfi1_set_ctxt_pkey(dd, uctxt->ctxt, pkey);
1529 done:
1530 	return ret;
1531 }
1532 
1533 static void user_remove(struct hfi1_devdata *dd)
1534 {
1535 
1536 	hfi1_cdev_cleanup(&dd->user_cdev, &dd->user_device);
1537 }
1538 
1539 static int user_add(struct hfi1_devdata *dd)
1540 {
1541 	char name[10];
1542 	int ret;
1543 
1544 	snprintf(name, sizeof(name), "%s_%d", class_name(), dd->unit);
1545 	ret = hfi1_cdev_init(dd->unit, name, &hfi1_file_ops,
1546 			     &dd->user_cdev, &dd->user_device,
1547 			     true, &dd->kobj);
1548 	if (ret)
1549 		user_remove(dd);
1550 
1551 	return ret;
1552 }
1553 
1554 /*
1555  * Create per-unit files in /dev
1556  */
1557 int hfi1_device_create(struct hfi1_devdata *dd)
1558 {
1559 	return user_add(dd);
1560 }
1561 
1562 /*
1563  * Remove per-unit files in /dev
1564  * void, core kernel returns no errors for this stuff
1565  */
1566 void hfi1_device_remove(struct hfi1_devdata *dd)
1567 {
1568 	user_remove(dd);
1569 }
1570