xref: /linux/drivers/infiniband/hw/qib/qib_file_ops.c (revision f2ee442115c9b6219083c019939a9cc0c9abb2f8)
1 /*
2  * Copyright (c) 2006, 2007, 2008, 2009, 2010 QLogic Corporation.
3  * All rights reserved.
4  * Copyright (c) 2003, 2004, 2005, 2006 PathScale, Inc. All rights reserved.
5  *
6  * This software is available to you under a choice of one of two
7  * licenses.  You may choose to be licensed under the terms of the GNU
8  * General Public License (GPL) Version 2, available from the file
9  * COPYING in the main directory of this source tree, or the
10  * OpenIB.org BSD license below:
11  *
12  *     Redistribution and use in source and binary forms, with or
13  *     without modification, are permitted provided that the following
14  *     conditions are met:
15  *
16  *      - Redistributions of source code must retain the above
17  *        copyright notice, this list of conditions and the following
18  *        disclaimer.
19  *
20  *      - Redistributions in binary form must reproduce the above
21  *        copyright notice, this list of conditions and the following
22  *        disclaimer in the documentation and/or other materials
23  *        provided with the distribution.
24  *
25  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
26  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
27  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
28  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
29  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
30  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
31  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
32  * SOFTWARE.
33  */
34 
35 #include <linux/pci.h>
36 #include <linux/poll.h>
37 #include <linux/cdev.h>
38 #include <linux/swap.h>
39 #include <linux/vmalloc.h>
40 #include <linux/highmem.h>
41 #include <linux/io.h>
42 #include <linux/uio.h>
43 #include <linux/jiffies.h>
44 #include <asm/pgtable.h>
45 #include <linux/delay.h>
46 #include <linux/export.h>
47 
48 #include "qib.h"
49 #include "qib_common.h"
50 #include "qib_user_sdma.h"
51 
52 static int qib_open(struct inode *, struct file *);
53 static int qib_close(struct inode *, struct file *);
54 static ssize_t qib_write(struct file *, const char __user *, size_t, loff_t *);
55 static ssize_t qib_aio_write(struct kiocb *, const struct iovec *,
56 			     unsigned long, loff_t);
57 static unsigned int qib_poll(struct file *, struct poll_table_struct *);
58 static int qib_mmapf(struct file *, struct vm_area_struct *);
59 
60 static const struct file_operations qib_file_ops = {
61 	.owner = THIS_MODULE,
62 	.write = qib_write,
63 	.aio_write = qib_aio_write,
64 	.open = qib_open,
65 	.release = qib_close,
66 	.poll = qib_poll,
67 	.mmap = qib_mmapf,
68 	.llseek = noop_llseek,
69 };
70 
71 /*
72  * Convert kernel virtual addresses to physical addresses so they don't
73  * potentially conflict with the chip addresses used as mmap offsets.
74  * It doesn't really matter what mmap offset we use as long as we can
75  * interpret it correctly.
76  */
77 static u64 cvt_kvaddr(void *p)
78 {
79 	struct page *page;
80 	u64 paddr = 0;
81 
82 	page = vmalloc_to_page(p);
83 	if (page)
84 		paddr = page_to_pfn(page) << PAGE_SHIFT;
85 
86 	return paddr;
87 }
88 
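/*
 * qib_get_base_info() - fill a qib_base_info with this context's layout
 * (rcvhdrq, eager buffers, PIO buffers, user registers) and copy it to
 * the user's buffer.  For shared contexts the PIO buffers are divided
 * among subcontexts, with the master also taking the remainder; e.g.
 * (illustrative numbers only) piocnt = 10 split 4 ways gives each slave
 * 2 buffers and the master 2 + (10 % 4) = 4.
 */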
89 static int qib_get_base_info(struct file *fp, void __user *ubase,
90 			     size_t ubase_size)
91 {
92 	struct qib_ctxtdata *rcd = ctxt_fp(fp);
93 	int ret = 0;
94 	struct qib_base_info *kinfo = NULL;
95 	struct qib_devdata *dd = rcd->dd;
96 	struct qib_pportdata *ppd = rcd->ppd;
97 	unsigned subctxt_cnt;
98 	int shared, master;
99 	size_t sz;
100 
101 	subctxt_cnt = rcd->subctxt_cnt;
102 	if (!subctxt_cnt) {
103 		shared = 0;
104 		master = 0;
105 		subctxt_cnt = 1;
106 	} else {
107 		shared = 1;
108 		master = !subctxt_fp(fp);
109 	}
110 
111 	sz = sizeof(*kinfo);
112 	/* If context sharing is not requested, allow the old size structure */
113 	if (!shared)
114 		sz -= 7 * sizeof(u64);
115 	if (ubase_size < sz) {
116 		ret = -EINVAL;
117 		goto bail;
118 	}
119 
120 	kinfo = kzalloc(sizeof(*kinfo), GFP_KERNEL);
121 	if (kinfo == NULL) {
122 		ret = -ENOMEM;
123 		goto bail;
124 	}
125 
126 	ret = dd->f_get_base_info(rcd, kinfo);
127 	if (ret < 0)
128 		goto bail;
129 
130 	kinfo->spi_rcvhdr_cnt = dd->rcvhdrcnt;
131 	kinfo->spi_rcvhdrent_size = dd->rcvhdrentsize;
132 	kinfo->spi_tidegrcnt = rcd->rcvegrcnt;
133 	kinfo->spi_rcv_egrbufsize = dd->rcvegrbufsize;
134 	/*
135 	 * have to mmap whole thing
136 	 */
137 	kinfo->spi_rcv_egrbuftotlen =
138 		rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size;
139 	kinfo->spi_rcv_egrperchunk = rcd->rcvegrbufs_perchunk;
140 	kinfo->spi_rcv_egrchunksize = kinfo->spi_rcv_egrbuftotlen /
141 		rcd->rcvegrbuf_chunks;
142 	kinfo->spi_tidcnt = dd->rcvtidcnt / subctxt_cnt;
143 	if (master)
144 		kinfo->spi_tidcnt += dd->rcvtidcnt % subctxt_cnt;
145 	/*
146 	 * for this use, this may be cfgctxts summed over all chips that
147 	 * are configured and present
148 	 */
149 	kinfo->spi_nctxts = dd->cfgctxts;
150 	/* unit (chip/board) our context is on */
151 	kinfo->spi_unit = dd->unit;
152 	kinfo->spi_port = ppd->port;
153 	/* for now, only a single page */
154 	kinfo->spi_tid_maxsize = PAGE_SIZE;
155 
156 	/*
157 	 * Doing this per context, and based on the skip value, etc.  This has
158 	 * to be the actual buffer size, since the protocol code treats it
159 	 * as an array.
160 	 *
161 	 * These have to be set to user addresses in the user code via mmap.
162 	 * These values are used on return to user code for the mmap target
163 	 * addresses only.  For 32 bit, same 44 bit address problem, so use
164 	 * the physical address, not virtual.  Before 2.6.11, using the
165 	 * page_address() macro worked, but in 2.6.11, even that returns the
166 	 * full 64 bit address (upper bits all 1's).  So far, using the
167 	 * physical addresses (or chip offsets, for chip mapping) works, but
168 	 * no doubt some future kernel release will change that, and we'll be
169 	 * on to yet another method of dealing with this.
170 	 * Normally only one of rcvhdr_tailaddr or rhf_offset is useful
171 	 * since the chips with non-zero rhf_offset don't normally
172 	 * enable tail register updates to host memory, but for testing,
173 	 * both can be enabled and used.
174 	 */
175 	kinfo->spi_rcvhdr_base = (u64) rcd->rcvhdrq_phys;
176 	kinfo->spi_rcvhdr_tailaddr = (u64) rcd->rcvhdrqtailaddr_phys;
177 	kinfo->spi_rhf_offset = dd->rhf_offset;
178 	kinfo->spi_rcv_egrbufs = (u64) rcd->rcvegr_phys;
179 	kinfo->spi_pioavailaddr = (u64) dd->pioavailregs_phys;
180 	/* setup per-unit (not port) status area for user programs */
181 	kinfo->spi_status = (u64) kinfo->spi_pioavailaddr +
182 		(char *) ppd->statusp -
183 		(char *) dd->pioavailregs_dma;
184 	kinfo->spi_uregbase = (u64) dd->uregbase + dd->ureg_align * rcd->ctxt;
185 	if (!shared) {
186 		kinfo->spi_piocnt = rcd->piocnt;
187 		kinfo->spi_piobufbase = (u64) rcd->piobufs;
188 		kinfo->spi_sendbuf_status = cvt_kvaddr(rcd->user_event_mask);
189 	} else if (master) {
190 		kinfo->spi_piocnt = (rcd->piocnt / subctxt_cnt) +
191 				    (rcd->piocnt % subctxt_cnt);
192 		/* Master's PIO buffers are after all the slaves' */
193 		kinfo->spi_piobufbase = (u64) rcd->piobufs +
194 			dd->palign *
195 			(rcd->piocnt - kinfo->spi_piocnt);
196 	} else {
197 		unsigned slave = subctxt_fp(fp) - 1;
198 
199 		kinfo->spi_piocnt = rcd->piocnt / subctxt_cnt;
200 		kinfo->spi_piobufbase = (u64) rcd->piobufs +
201 			dd->palign * kinfo->spi_piocnt * slave;
202 	}
203 
204 	if (shared) {
205 		kinfo->spi_sendbuf_status =
206 			cvt_kvaddr(&rcd->user_event_mask[subctxt_fp(fp)]);
207 		/* only spi_subctxt_* fields should be set in this block! */
208 		kinfo->spi_subctxt_uregbase = cvt_kvaddr(rcd->subctxt_uregbase);
209 
210 		kinfo->spi_subctxt_rcvegrbuf =
211 			cvt_kvaddr(rcd->subctxt_rcvegrbuf);
212 		kinfo->spi_subctxt_rcvhdr_base =
213 			cvt_kvaddr(rcd->subctxt_rcvhdr_base);
214 	}
215 
216 	/*
217 	 * All user buffers are 2KB buffers.  If we ever support
218 	 * giving 4KB buffers to user processes, this will need some
219 	 * work.  Can't use piobufbase directly, because it has
220 	 * both 2K and 4K buffer base values.
221 	 */
222 	kinfo->spi_pioindex = (kinfo->spi_piobufbase - dd->pio2k_bufbase) /
223 		dd->palign;
224 	kinfo->spi_pioalign = dd->palign;
225 	kinfo->spi_qpair = QIB_KD_QP;
226 	/*
227 	 * user mode PIO buffers are always 2KB, even when 4KB can
228 	 * be received, and sent via the kernel; this is ibmaxlen
229 	 * for 2K MTU.
230 	 */
231 	kinfo->spi_piosize = dd->piosize2k - 2 * sizeof(u32);
232 	kinfo->spi_mtu = ppd->ibmaxlen; /* maxlen, not ibmtu */
233 	kinfo->spi_ctxt = rcd->ctxt;
234 	kinfo->spi_subctxt = subctxt_fp(fp);
235 	kinfo->spi_sw_version = QIB_KERN_SWVERSION;
236 	kinfo->spi_sw_version |= 1U << 31; /* QLogic-built, not kernel.org */
237 	kinfo->spi_hw_version = dd->revision;
238 
239 	if (master)
240 		kinfo->spi_runtime_flags |= QIB_RUNTIME_MASTER;
241 
242 	sz = (ubase_size < sizeof(*kinfo)) ? ubase_size : sizeof(*kinfo);
243 	if (copy_to_user(ubase, kinfo, sz))
244 		ret = -EFAULT;
245 bail:
246 	kfree(kinfo);
247 	return ret;
248 }
249 
250 /**
251  * qib_tid_update - update a context TID
252  * @rcd: the context
253  * @fp: the qib device file
254  * @ti: the TID information
255  *
256  * The new implementation as of Oct 2004 is that the driver assigns
257  * the tid and returns it to the caller.   To reduce search time, we
258  * keep a cursor for each context, walking the shadow tid array to find
259  * one that's not in use.
260  *
261  * For now, if we can't allocate the full list, we fail, although
262  * in the long run, we'll allocate as many as we can, and the
263  * caller will deal with that by trying the remaining pages later.
264  * That means that when we fail, we have to mark the tids as not in
265  * use again, in our shadow copy.
266  *
267  * It's up to the caller to free the tids when they are done.
268  * We'll unlock the pages as they free them.
269  *
270  * Also, right now we are locking one page at a time, but since
271  * the intended use of this routine is for a single group of
272  * virtually contiguous pages, that should change to improve
273  * performance.
274  */
275 static int qib_tid_update(struct qib_ctxtdata *rcd, struct file *fp,
276 			  const struct qib_tid_info *ti)
277 {
278 	int ret = 0, ntids;
279 	u32 tid, ctxttid, cnt, i, tidcnt, tidoff;
280 	u16 *tidlist;
281 	struct qib_devdata *dd = rcd->dd;
282 	u64 physaddr;
283 	unsigned long vaddr;
284 	u64 __iomem *tidbase;
285 	unsigned long tidmap[8];
286 	struct page **pagep = NULL;
287 	unsigned subctxt = subctxt_fp(fp);
288 
289 	if (!dd->pageshadow) {
290 		ret = -ENOMEM;
291 		goto done;
292 	}
293 
294 	cnt = ti->tidcnt;
295 	if (!cnt) {
296 		ret = -EFAULT;
297 		goto done;
298 	}
299 	ctxttid = rcd->ctxt * dd->rcvtidcnt;
300 	if (!rcd->subctxt_cnt) {
301 		tidcnt = dd->rcvtidcnt;
302 		tid = rcd->tidcursor;
303 		tidoff = 0;
304 	} else if (!subctxt) {
305 		tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) +
306 			 (dd->rcvtidcnt % rcd->subctxt_cnt);
307 		tidoff = dd->rcvtidcnt - tidcnt;
308 		ctxttid += tidoff;
309 		tid = tidcursor_fp(fp);
310 	} else {
311 		tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt;
312 		tidoff = tidcnt * (subctxt - 1);
313 		ctxttid += tidoff;
314 		tid = tidcursor_fp(fp);
315 	}
316 	if (cnt > tidcnt) {
317 		/* make sure it all fits in tid_pg_list */
318 		qib_devinfo(dd->pcidev, "Process tried to allocate %u "
319 			 "TIDs, only trying max (%u)\n", cnt, tidcnt);
320 		cnt = tidcnt;
321 	}
322 	pagep = (struct page **) rcd->tid_pg_list;
323 	tidlist = (u16 *) &pagep[dd->rcvtidcnt];
324 	pagep += tidoff;
325 	tidlist += tidoff;
326 
327 	memset(tidmap, 0, sizeof(tidmap));
328 	/* before decrement; chip actual # */
329 	ntids = tidcnt;
330 	tidbase = (u64 __iomem *) (((char __iomem *) dd->kregbase) +
331 				   dd->rcvtidbase +
332 				   ctxttid * sizeof(*tidbase));
333 
334 	/* virtual address of first page in transfer */
335 	vaddr = ti->tidvaddr;
336 	if (!access_ok(VERIFY_WRITE, (void __user *) vaddr,
337 		       cnt * PAGE_SIZE)) {
338 		ret = -EFAULT;
339 		goto done;
340 	}
341 	ret = qib_get_user_pages(vaddr, cnt, pagep);
342 	if (ret) {
343 		/*
344 		 * if (ret == -EBUSY)
345 		 * We can't continue because the pagep array won't be
346 		 * initialized. This should never happen,
347 		 * unless perhaps the user has mpin'ed the pages
348 		 * themselves.
349 		 */
350 		qib_devinfo(dd->pcidev,
351 			 "Failed to lock addr %p, %u pages: "
352 			 "errno %d\n", (void *) vaddr, cnt, -ret);
353 		goto done;
354 	}
355 	for (i = 0; i < cnt; i++, vaddr += PAGE_SIZE) {
356 		for (; ntids--; tid++) {
357 			if (tid == tidcnt)
358 				tid = 0;
359 			if (!dd->pageshadow[ctxttid + tid])
360 				break;
361 		}
362 		if (ntids < 0) {
363 			/*
364 			 * Oops, wrapped all the way through their TIDs,
365 			 * and didn't have enough free; see comments at
366 			 * start of routine
367 			 */
368 			i--;    /* last tidlist[i] not filled in */
369 			ret = -ENOMEM;
370 			break;
371 		}
372 		tidlist[i] = tid + tidoff;
373 		/* we "know" system pages and TID pages are same size */
374 		dd->pageshadow[ctxttid + tid] = pagep[i];
375 		dd->physshadow[ctxttid + tid] =
376 			qib_map_page(dd->pcidev, pagep[i], 0, PAGE_SIZE,
377 				     PCI_DMA_FROMDEVICE);
378 		/*
379 		 * don't need atomic or it's overhead
380 		 */
381 		__set_bit(tid, tidmap);
382 		physaddr = dd->physshadow[ctxttid + tid];
383 		/* PERFORMANCE: below should almost certainly be cached */
384 		dd->f_put_tid(dd, &tidbase[tid],
385 				  RCVHQ_RCV_TYPE_EXPECTED, physaddr);
386 		/*
387 		 * don't check this tid in qib_ctxtshadow, since we
388 		 * just filled it in; start with the next one.
389 		 */
390 		tid++;
391 	}
392 
393 	if (ret) {
394 		u32 limit;
395 cleanup:
396 		/* jump here if copy out of updated info failed... */
397 		/* same code that's in qib_free_tid() */
398 		limit = sizeof(tidmap) * BITS_PER_BYTE;
399 		if (limit > tidcnt)
400 			/* just in case size changes in future */
401 			limit = tidcnt;
402 		tid = find_first_bit((const unsigned long *)tidmap, limit);
403 		for (; tid < limit; tid++) {
404 			if (!test_bit(tid, tidmap))
405 				continue;
406 			if (dd->pageshadow[ctxttid + tid]) {
407 				dma_addr_t phys;
408 
409 				phys = dd->physshadow[ctxttid + tid];
410 				dd->physshadow[ctxttid + tid] = dd->tidinvalid;
411 				/* PERFORMANCE: below should almost certainly
412 				 * be cached
413 				 */
414 				dd->f_put_tid(dd, &tidbase[tid],
415 					      RCVHQ_RCV_TYPE_EXPECTED,
416 					      dd->tidinvalid);
417 				pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,
418 					       PCI_DMA_FROMDEVICE);
419 				dd->pageshadow[ctxttid + tid] = NULL;
420 			}
421 		}
422 		qib_release_user_pages(pagep, cnt);
423 	} else {
424 		/*
425 		 * Copy the updated array, with qib_tid's filled in, back
426 		 * to user.  Since we already did the copy-in, this "should
427 		 * never fail".  If it does, we have to clean up...
428 		 */
429 		if (copy_to_user((void __user *)
430 				 (unsigned long) ti->tidlist,
431 				 tidlist, cnt * sizeof(*tidlist))) {
432 			ret = -EFAULT;
433 			goto cleanup;
434 		}
435 		if (copy_to_user((void __user *) (unsigned long) ti->tidmap,
436 				 tidmap, sizeof tidmap)) {
437 			ret = -EFAULT;
438 			goto cleanup;
439 		}
440 		if (tid == tidcnt)
441 			tid = 0;
442 		if (!rcd->subctxt_cnt)
443 			rcd->tidcursor = tid;
444 		else
445 			tidcursor_fp(fp) = tid;
446 	}
447 
448 done:
449 	return ret;
450 }
451 
452 /**
453  * qib_tid_free - free a context TID
454  * @rcd: the context
455  * @subctxt: the subcontext
456  * @ti: the TID info
457  *
458  * right now we are unlocking one page at a time, but since
459  * the intended use of this routine is for a single group of
460  * virtually contiguous pages, that should change to improve
461  * performance.  We check that the TID is in range for this context
462  * but otherwise don't check validity; if user has an error and
463  * frees the wrong tid, it's only their own data that can thereby
464  * be corrupted.  We do check that the TID was in use, for sanity.
465  * We always use our idea of the saved address, not the address that
466  * they pass in to us.
467  */
468 static int qib_tid_free(struct qib_ctxtdata *rcd, unsigned subctxt,
469 			const struct qib_tid_info *ti)
470 {
471 	int ret = 0;
472 	u32 tid, ctxttid, cnt, limit, tidcnt;
473 	struct qib_devdata *dd = rcd->dd;
474 	u64 __iomem *tidbase;
475 	unsigned long tidmap[8];
476 
477 	if (!dd->pageshadow) {
478 		ret = -ENOMEM;
479 		goto done;
480 	}
481 
482 	if (copy_from_user(tidmap, (void __user *)(unsigned long)ti->tidmap,
483 			   sizeof tidmap)) {
484 		ret = -EFAULT;
485 		goto done;
486 	}
487 
488 	ctxttid = rcd->ctxt * dd->rcvtidcnt;
489 	if (!rcd->subctxt_cnt)
490 		tidcnt = dd->rcvtidcnt;
491 	else if (!subctxt) {
492 		tidcnt = (dd->rcvtidcnt / rcd->subctxt_cnt) +
493 			 (dd->rcvtidcnt % rcd->subctxt_cnt);
494 		ctxttid += dd->rcvtidcnt - tidcnt;
495 	} else {
496 		tidcnt = dd->rcvtidcnt / rcd->subctxt_cnt;
497 		ctxttid += tidcnt * (subctxt - 1);
498 	}
499 	tidbase = (u64 __iomem *) ((char __iomem *)(dd->kregbase) +
500 				   dd->rcvtidbase +
501 				   ctxttid * sizeof(*tidbase));
502 
503 	limit = sizeof(tidmap) * BITS_PER_BYTE;
504 	if (limit > tidcnt)
505 		/* just in case size changes in future */
506 		limit = tidcnt;
507 	tid = find_first_bit(tidmap, limit);
508 	for (cnt = 0; tid < limit; tid++) {
509 		/*
510 		 * small optimization; if we detect a run of 3 or so without
511 		 * any set, use find_first_bit again.  That's mainly to
512 		 * accelerate the case where we wrapped, so we have some at
513 		 * the beginning, and some at the end, and a big gap
514 		 * in the middle.
515 		 */
516 		if (!test_bit(tid, tidmap))
517 			continue;
518 		cnt++;
519 		if (dd->pageshadow[ctxttid + tid]) {
520 			struct page *p;
521 			dma_addr_t phys;
522 
523 			p = dd->pageshadow[ctxttid + tid];
524 			dd->pageshadow[ctxttid + tid] = NULL;
525 			phys = dd->physshadow[ctxttid + tid];
526 			dd->physshadow[ctxttid + tid] = dd->tidinvalid;
527 			/* PERFORMANCE: below should almost certainly be
528 			 * cached
529 			 */
530 			dd->f_put_tid(dd, &tidbase[tid],
531 				      RCVHQ_RCV_TYPE_EXPECTED, dd->tidinvalid);
532 			pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,
533 				       PCI_DMA_FROMDEVICE);
534 			qib_release_user_pages(&p, 1);
535 		}
536 	}
537 done:
538 	return ret;
539 }
540 
541 /**
542  * qib_set_part_key - set a partition key
543  * @rcd: the context
544  * @key: the key
545  *
546  * We can have up to 4 active at a time (other than the default, which is
547  * always allowed).  This is somewhat tricky, since multiple contexts may set
548  * the same key, so we reference count them, and clean up at exit.  All 4
549  * partition keys are packed into a single qlogic_ib register.  It's an
550  * error for a process to set the same pkey multiple times.  We provide no
551  * mechanism to de-allocate a pkey at this time; we may eventually need to
552  * do that.  I've used the atomic operations, and no locking, and only make
553  * a single pass through what's available.  This should be more than
554  * adequate for some time. I'll think about spinlocks or the like if and as
555  * it's necessary.
556  */
557 static int qib_set_part_key(struct qib_ctxtdata *rcd, u16 key)
558 {
559 	struct qib_pportdata *ppd = rcd->ppd;
560 	int i, any = 0, pidx = -1;
561 	u16 lkey = key & 0x7FFF;
562 	int ret;
563 
564 	if (lkey == (QIB_DEFAULT_P_KEY & 0x7FFF)) {
565 		/* nothing to do; this key always valid */
566 		ret = 0;
567 		goto bail;
568 	}
569 
570 	if (!lkey) {
571 		ret = -EINVAL;
572 		goto bail;
573 	}
574 
575 	/*
576 	 * Set the full membership bit, because it has to be
577 	 * set in the register or the packet, and it seems
578 	 * cleaner to set in the register than to force all
579 	 * callers to set it.
580 	 */
581 	key |= 0x8000;
582 
583 	for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) {
584 		if (!rcd->pkeys[i] && pidx == -1)
585 			pidx = i;
586 		if (rcd->pkeys[i] == key) {
587 			ret = -EEXIST;
588 			goto bail;
589 		}
590 	}
591 	if (pidx == -1) {
592 		ret = -EBUSY;
593 		goto bail;
594 	}
595 	for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
596 		if (!ppd->pkeys[i]) {
597 			any++;
598 			continue;
599 		}
600 		if (ppd->pkeys[i] == key) {
601 			atomic_t *pkrefs = &ppd->pkeyrefs[i];
602 
603 			if (atomic_inc_return(pkrefs) > 1) {
604 				rcd->pkeys[pidx] = key;
605 				ret = 0;
606 				goto bail;
607 			} else {
608 				/*
609 				 * lost race, decrement count, catch below
610 				 */
611 				atomic_dec(pkrefs);
612 				any++;
613 			}
614 		}
615 		if ((ppd->pkeys[i] & 0x7FFF) == lkey) {
616 			/*
617 			 * It makes no sense to have both the limited and
618 			 * full membership PKEY set at the same time since
619 			 * the unlimited one will disable the limited one.
620 			 */
621 			ret = -EEXIST;
622 			goto bail;
623 		}
624 	}
625 	if (!any) {
626 		ret = -EBUSY;
627 		goto bail;
628 	}
629 	for (any = i = 0; i < ARRAY_SIZE(ppd->pkeys); i++) {
630 		if (!ppd->pkeys[i] &&
631 		    atomic_inc_return(&ppd->pkeyrefs[i]) == 1) {
632 			rcd->pkeys[pidx] = key;
633 			ppd->pkeys[i] = key;
634 			(void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0);
635 			ret = 0;
636 			goto bail;
637 		}
638 	}
639 	ret = -EBUSY;
640 
641 bail:
642 	return ret;
643 }
644 
645 /**
646  * qib_manage_rcvq - manage a context's receive queue
647  * @rcd: the context
648  * @subctxt: the subcontext
649  * @start_stop: action to carry out
650  *
651  * start_stop == 0 disables receive on the context, for use in queue
652  * overflow conditions.  start_stop==1 re-enables, to be used to
653  * re-init the software copy of the head register
654  */
655 static int qib_manage_rcvq(struct qib_ctxtdata *rcd, unsigned subctxt,
656 			   int start_stop)
657 {
658 	struct qib_devdata *dd = rcd->dd;
659 	unsigned int rcvctrl_op;
660 
661 	if (subctxt)
662 		goto bail;
663 	/* atomically enable or disable receive for this ctxt. */
664 	if (start_stop) {
665 		/*
666 		 * On enable, force in-memory copy of the tail register to
667 		 * 0, so that protocol code doesn't have to worry about
668 		 * whether or not the chip has yet updated the in-memory
669 		 * copy or not on return from the system call. The chip
670 		 * always resets its tail register back to 0 on a
671 		 * transition from disabled to enabled.
672 		 */
673 		if (rcd->rcvhdrtail_kvaddr)
674 			qib_clear_rcvhdrtail(rcd);
675 		rcvctrl_op = QIB_RCVCTRL_CTXT_ENB;
676 	} else
677 		rcvctrl_op = QIB_RCVCTRL_CTXT_DIS;
678 	dd->f_rcvctrl(rcd->ppd, rcvctrl_op, rcd->ctxt);
679 	/* always; new head should be equal to new tail; see above */
680 bail:
681 	return 0;
682 }
683 
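/*
 * qib_clean_part_key() - drop this context's references on any partition
 * keys it had set; a port pkey slot is cleared once its refcount reaches
 * zero, and the hardware pkey table is only rewritten if something
 * actually changed.
 */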
684 static void qib_clean_part_key(struct qib_ctxtdata *rcd,
685 			       struct qib_devdata *dd)
686 {
687 	int i, j, pchanged = 0;
688 	u64 oldpkey;
689 	struct qib_pportdata *ppd = rcd->ppd;
690 
691 	/* for debugging only */
692 	oldpkey = (u64) ppd->pkeys[0] |
693 		((u64) ppd->pkeys[1] << 16) |
694 		((u64) ppd->pkeys[2] << 32) |
695 		((u64) ppd->pkeys[3] << 48);
696 
697 	for (i = 0; i < ARRAY_SIZE(rcd->pkeys); i++) {
698 		if (!rcd->pkeys[i])
699 			continue;
700 		for (j = 0; j < ARRAY_SIZE(ppd->pkeys); j++) {
701 			/* check for match independent of the global bit */
702 			if ((ppd->pkeys[j] & 0x7fff) !=
703 			    (rcd->pkeys[i] & 0x7fff))
704 				continue;
705 			if (atomic_dec_and_test(&ppd->pkeyrefs[j])) {
706 				ppd->pkeys[j] = 0;
707 				pchanged++;
708 			}
709 			break;
710 		}
711 		rcd->pkeys[i] = 0;
712 	}
713 	if (pchanged)
714 		(void) ppd->dd->f_set_ib_cfg(ppd, QIB_IB_CFG_PKEYS, 0);
715 }
716 
717 /* common code for the mappings on dma_alloc_coherent mem */
718 static int qib_mmap_mem(struct vm_area_struct *vma, struct qib_ctxtdata *rcd,
719 			unsigned len, void *kvaddr, u32 write_ok, char *what)
720 {
721 	struct qib_devdata *dd = rcd->dd;
722 	unsigned long pfn;
723 	int ret;
724 
725 	if ((vma->vm_end - vma->vm_start) > len) {
726 		qib_devinfo(dd->pcidev,
727 			 "FAIL on %s: len %lx > %x\n", what,
728 			 vma->vm_end - vma->vm_start, len);
729 		ret = -EFAULT;
730 		goto bail;
731 	}
732 
733 	/*
734 	 * shared context user code requires rcvhdrq mapped r/w, others
735 	 * only allowed readonly mapping.
736 	 */
737 	if (!write_ok) {
738 		if (vma->vm_flags & VM_WRITE) {
739 			qib_devinfo(dd->pcidev,
740 				 "%s must be mapped readonly\n", what);
741 			ret = -EPERM;
742 			goto bail;
743 		}
744 
745 		/* don't allow them to later change with mprotect */
746 		vma->vm_flags &= ~VM_MAYWRITE;
747 	}
748 
749 	pfn = virt_to_phys(kvaddr) >> PAGE_SHIFT;
750 	ret = remap_pfn_range(vma, vma->vm_start, pfn,
751 			      len, vma->vm_page_prot);
752 	if (ret)
753 		qib_devinfo(dd->pcidev, "%s ctxt%u mmap of %lx, %x "
754 			 "bytes failed: %d\n", what, rcd->ctxt,
755 			 pfn, len, ret);
756 bail:
757 	return ret;
758 }
759 
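/*
 * mmap_ureg() - map the per-context user registers from chip BAR space
 * into the process, uncached; two pages are allowed when header
 * suppression is supported, otherwise one.
 */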
760 static int mmap_ureg(struct vm_area_struct *vma, struct qib_devdata *dd,
761 		     u64 ureg)
762 {
763 	unsigned long phys;
764 	unsigned long sz;
765 	int ret;
766 
767 	/*
768 	 * This is real hardware, so use io_remap.  This is the mechanism
769 	 * for the user process to update the head registers for their ctxt
770 	 * in the chip.
771 	 */
772 	sz = dd->flags & QIB_HAS_HDRSUPP ? 2 * PAGE_SIZE : PAGE_SIZE;
773 	if ((vma->vm_end - vma->vm_start) > sz) {
774 		qib_devinfo(dd->pcidev, "FAIL mmap userreg: reqlen "
775 			 "%lx > PAGE\n", vma->vm_end - vma->vm_start);
776 		ret = -EFAULT;
777 	} else {
778 		phys = dd->physaddr + ureg;
779 		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
780 
781 		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
782 		ret = io_remap_pfn_range(vma, vma->vm_start,
783 					 phys >> PAGE_SHIFT,
784 					 vma->vm_end - vma->vm_start,
785 					 vma->vm_page_prot);
786 	}
787 	return ret;
788 }
789 
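/*
 * mmap_piobufs() - map this context's (or subcontext's) share of the PIO
 * send buffers into the process, intended write-only, using
 * write-combining when qib_wc_pat is set.
 */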
790 static int mmap_piobufs(struct vm_area_struct *vma,
791 			struct qib_devdata *dd,
792 			struct qib_ctxtdata *rcd,
793 			unsigned piobufs, unsigned piocnt)
794 {
795 	unsigned long phys;
796 	int ret;
797 
798 	/*
799 	 * When we map the PIO buffers in the chip, we want to map them as
800 	 * writeonly, no read possible; unfortunately, x86 doesn't allow
801 	 * for this in hardware, but we still prevent users from asking
802 	 * for it.
803 	 */
804 	if ((vma->vm_end - vma->vm_start) > (piocnt * dd->palign)) {
805 		qib_devinfo(dd->pcidev, "FAIL mmap piobufs: "
806 			 "reqlen %lx > PAGE\n",
807 			 vma->vm_end - vma->vm_start);
808 		ret = -EINVAL;
809 		goto bail;
810 	}
811 
812 	phys = dd->physaddr + piobufs;
813 
814 #if defined(__powerpc__)
815 	/* There isn't a generic way to specify writethrough mappings */
816 	pgprot_val(vma->vm_page_prot) |= _PAGE_NO_CACHE;
817 	pgprot_val(vma->vm_page_prot) |= _PAGE_WRITETHRU;
818 	pgprot_val(vma->vm_page_prot) &= ~_PAGE_GUARDED;
819 #endif
820 
821 	/*
822 	 * don't allow them to later change to readable with mprotect (for when
823 	 * not initially mapped readable, as is normally the case)
824 	 */
825 	vma->vm_flags &= ~VM_MAYREAD;
826 	vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
827 
828 	if (qib_wc_pat)
829 		vma->vm_page_prot = pgprot_writecombine(vma->vm_page_prot);
830 
831 	ret = io_remap_pfn_range(vma, vma->vm_start, phys >> PAGE_SHIFT,
832 				 vma->vm_end - vma->vm_start,
833 				 vma->vm_page_prot);
834 bail:
835 	return ret;
836 }
837 
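/*
 * mmap_rcvegrbufs() - map all of this context's eager receive buffer
 * chunks read-only into one contiguous user mapping, one
 * remap_pfn_range() call per chunk.
 */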
838 static int mmap_rcvegrbufs(struct vm_area_struct *vma,
839 			   struct qib_ctxtdata *rcd)
840 {
841 	struct qib_devdata *dd = rcd->dd;
842 	unsigned long start, size;
843 	size_t total_size, i;
844 	unsigned long pfn;
845 	int ret;
846 
847 	size = rcd->rcvegrbuf_size;
848 	total_size = rcd->rcvegrbuf_chunks * size;
849 	if ((vma->vm_end - vma->vm_start) > total_size) {
850 		qib_devinfo(dd->pcidev, "FAIL on egr bufs: "
851 			 "reqlen %lx > actual %lx\n",
852 			 vma->vm_end - vma->vm_start,
853 			 (unsigned long) total_size);
854 		ret = -EINVAL;
855 		goto bail;
856 	}
857 
858 	if (vma->vm_flags & VM_WRITE) {
859 		qib_devinfo(dd->pcidev, "Can't map eager buffers as "
860 			 "writable (flags=%lx)\n", vma->vm_flags);
861 		ret = -EPERM;
862 		goto bail;
863 	}
864 	/* don't allow them to later change to writeable with mprotect */
865 	vma->vm_flags &= ~VM_MAYWRITE;
866 
867 	start = vma->vm_start;
868 
869 	for (i = 0; i < rcd->rcvegrbuf_chunks; i++, start += size) {
870 		pfn = virt_to_phys(rcd->rcvegrbuf[i]) >> PAGE_SHIFT;
871 		ret = remap_pfn_range(vma, start, pfn, size,
872 				      vma->vm_page_prot);
873 		if (ret < 0)
874 			goto bail;
875 	}
876 	ret = 0;
877 
878 bail:
879 	return ret;
880 }
881 
882 /*
883  * qib_file_vma_fault - handle a VMA page fault.
884  */
885 static int qib_file_vma_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
886 {
887 	struct page *page;
888 
889 	page = vmalloc_to_page((void *)(vmf->pgoff << PAGE_SHIFT));
890 	if (!page)
891 		return VM_FAULT_SIGBUS;
892 
893 	get_page(page);
894 	vmf->page = page;
895 
896 	return 0;
897 }
898 
899 static struct vm_operations_struct qib_file_vm_ops = {
900 	.fault = qib_file_vma_fault,
901 };
902 
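/*
 * mmap_kvaddr() - handle mmap offsets that name vmalloc'ed shared-context
 * memory (subcontext uregbase, rcvhdrq and eager-buffer copies, and the
 * user event mask).  The offset is compared against cvt_kvaddr() of each
 * candidate; a match sets up fault-based mapping via qib_file_vm_ops and
 * returns 1, 0 means "not a kernel address", negative is an error.
 */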
903 static int mmap_kvaddr(struct vm_area_struct *vma, u64 pgaddr,
904 		       struct qib_ctxtdata *rcd, unsigned subctxt)
905 {
906 	struct qib_devdata *dd = rcd->dd;
907 	unsigned subctxt_cnt;
908 	unsigned long len;
909 	void *addr;
910 	size_t size;
911 	int ret = 0;
912 
913 	subctxt_cnt = rcd->subctxt_cnt;
914 	size = rcd->rcvegrbuf_chunks * rcd->rcvegrbuf_size;
915 
916 	/*
917 	 * Each process has all the subctxt uregbase, rcvhdrq, and
918 	 * rcvegrbufs mmapped - as an array for all the processes,
919 	 * and also separately for this process.
920 	 */
921 	if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase)) {
922 		addr = rcd->subctxt_uregbase;
923 		size = PAGE_SIZE * subctxt_cnt;
924 	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base)) {
925 		addr = rcd->subctxt_rcvhdr_base;
926 		size = rcd->rcvhdrq_size * subctxt_cnt;
927 	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf)) {
928 		addr = rcd->subctxt_rcvegrbuf;
929 		size *= subctxt_cnt;
930 	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_uregbase +
931 					PAGE_SIZE * subctxt)) {
932 		addr = rcd->subctxt_uregbase + PAGE_SIZE * subctxt;
933 		size = PAGE_SIZE;
934 	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvhdr_base +
935 					rcd->rcvhdrq_size * subctxt)) {
936 		addr = rcd->subctxt_rcvhdr_base +
937 			rcd->rcvhdrq_size * subctxt;
938 		size = rcd->rcvhdrq_size;
939 	} else if (pgaddr == cvt_kvaddr(&rcd->user_event_mask[subctxt])) {
940 		addr = rcd->user_event_mask;
941 		size = PAGE_SIZE;
942 	} else if (pgaddr == cvt_kvaddr(rcd->subctxt_rcvegrbuf +
943 					size * subctxt)) {
944 		addr = rcd->subctxt_rcvegrbuf + size * subctxt;
945 		/* rcvegrbufs are read-only on the slave */
946 		if (vma->vm_flags & VM_WRITE) {
947 			qib_devinfo(dd->pcidev,
948 				 "Can't map eager buffers as "
949 				 "writable (flags=%lx)\n", vma->vm_flags);
950 			ret = -EPERM;
951 			goto bail;
952 		}
953 		/*
954 		 * Don't allow permission to later change to writeable
955 		 * with mprotect.
956 		 */
957 		vma->vm_flags &= ~VM_MAYWRITE;
958 	} else
959 		goto bail;
960 	len = vma->vm_end - vma->vm_start;
961 	if (len > size) {
962 		ret = -EINVAL;
963 		goto bail;
964 	}
965 
966 	vma->vm_pgoff = (unsigned long) addr >> PAGE_SHIFT;
967 	vma->vm_ops = &qib_file_vm_ops;
968 	vma->vm_flags |= VM_RESERVED | VM_DONTEXPAND;
969 	ret = 1;
970 
971 bail:
972 	return ret;
973 }
974 
975 /**
976  * qib_mmapf - mmap various structures into user space
977  * @fp: the file pointer
978  * @vma: the VM area
979  *
980  * We use this to have a shared buffer between the kernel and the user code
981  * for the rcvhdr queue, egr buffers, and the per-context user regs and pio
982  * buffers in the chip.  We have the open and close entries so we can bump
983  * the ref count and keep the driver from being unloaded while still mapped.
984  */
985 static int qib_mmapf(struct file *fp, struct vm_area_struct *vma)
986 {
987 	struct qib_ctxtdata *rcd;
988 	struct qib_devdata *dd;
989 	u64 pgaddr, ureg;
990 	unsigned piobufs, piocnt;
991 	int ret, match = 1;
992 
993 	rcd = ctxt_fp(fp);
994 	if (!rcd || !(vma->vm_flags & VM_SHARED)) {
995 		ret = -EINVAL;
996 		goto bail;
997 	}
998 	dd = rcd->dd;
999 
1000 	/*
1001 	 * This maps what qib_do_user_init() set up: the shared buffers
1002 	 * and per-context user registers into the user process. The address
1003 	 * referred to by vm_pgoff is the file offset passed via mmap().
1004 	 * For shared contexts, this is the kernel vmalloc() address of the
1005 	 * pages to share with the master.
1006 	 * For non-shared or master ctxts, this is a physical address.
1007 	 * We only do one mmap for each space mapped.
1008 	 */
1009 	pgaddr = vma->vm_pgoff << PAGE_SHIFT;
1010 
1011 	/*
1012 	 * Check for 0 in case one of the allocations failed, but user
1013 	 * called mmap anyway.
1014 	 */
1015 	if (!pgaddr)  {
1016 		ret = -EINVAL;
1017 		goto bail;
1018 	}
1019 
1020 	/*
1021 	 * Physical addresses must fit in 40 bits for our hardware.
1022 	 * Check for kernel virtual addresses first, anything else must
1023 	 * match a HW or memory address.
1024 	 */
1025 	ret = mmap_kvaddr(vma, pgaddr, rcd, subctxt_fp(fp));
1026 	if (ret) {
1027 		if (ret > 0)
1028 			ret = 0;
1029 		goto bail;
1030 	}
1031 
1032 	ureg = dd->uregbase + dd->ureg_align * rcd->ctxt;
1033 	if (!rcd->subctxt_cnt) {
1034 		/* ctxt is not shared */
1035 		piocnt = rcd->piocnt;
1036 		piobufs = rcd->piobufs;
1037 	} else if (!subctxt_fp(fp)) {
1038 		/* caller is the master */
1039 		piocnt = (rcd->piocnt / rcd->subctxt_cnt) +
1040 			 (rcd->piocnt % rcd->subctxt_cnt);
1041 		piobufs = rcd->piobufs +
1042 			dd->palign * (rcd->piocnt - piocnt);
1043 	} else {
1044 		unsigned slave = subctxt_fp(fp) - 1;
1045 
1046 		/* caller is a slave */
1047 		piocnt = rcd->piocnt / rcd->subctxt_cnt;
1048 		piobufs = rcd->piobufs + dd->palign * piocnt * slave;
1049 	}
1050 
1051 	if (pgaddr == ureg)
1052 		ret = mmap_ureg(vma, dd, ureg);
1053 	else if (pgaddr == piobufs)
1054 		ret = mmap_piobufs(vma, dd, rcd, piobufs, piocnt);
1055 	else if (pgaddr == dd->pioavailregs_phys)
1056 		/* in-memory copy of pioavail registers */
1057 		ret = qib_mmap_mem(vma, rcd, PAGE_SIZE,
1058 				   (void *) dd->pioavailregs_dma, 0,
1059 				   "pioavail registers");
1060 	else if (pgaddr == rcd->rcvegr_phys)
1061 		ret = mmap_rcvegrbufs(vma, rcd);
1062 	else if (pgaddr == (u64) rcd->rcvhdrq_phys)
1063 		/*
1064 		 * The rcvhdrq itself; multiple pages, contiguous
1065 		 * from an i/o perspective.  Shared contexts need
1066 		 * to map r/w, so we allow writing.
1067 		 */
1068 		ret = qib_mmap_mem(vma, rcd, rcd->rcvhdrq_size,
1069 				   rcd->rcvhdrq, 1, "rcvhdrq");
1070 	else if (pgaddr == (u64) rcd->rcvhdrqtailaddr_phys)
1071 		/* in-memory copy of rcvhdrq tail register */
1072 		ret = qib_mmap_mem(vma, rcd, PAGE_SIZE,
1073 				   rcd->rcvhdrtail_kvaddr, 0,
1074 				   "rcvhdrq tail");
1075 	else
1076 		match = 0;
1077 	if (!match)
1078 		ret = -EINVAL;
1079 
1080 	vma->vm_private_data = NULL;
1081 
1082 	if (ret < 0)
1083 		qib_devinfo(dd->pcidev,
1084 			 "mmap Failure %d: off %llx len %lx\n",
1085 			 -ret, (unsigned long long)pgaddr,
1086 			 vma->vm_end - vma->vm_start);
1087 bail:
1088 	return ret;
1089 }
1090 
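/*
 * qib_poll_urgent() - report POLLIN when the context's urgent counter has
 * advanced since the last poll; otherwise note that we are waiting
 * (QIB_CTXT_WAITING_URG) so a later urgent packet wakes the poller.
 */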
1091 static unsigned int qib_poll_urgent(struct qib_ctxtdata *rcd,
1092 				    struct file *fp,
1093 				    struct poll_table_struct *pt)
1094 {
1095 	struct qib_devdata *dd = rcd->dd;
1096 	unsigned pollflag;
1097 
1098 	poll_wait(fp, &rcd->wait, pt);
1099 
1100 	spin_lock_irq(&dd->uctxt_lock);
1101 	if (rcd->urgent != rcd->urgent_poll) {
1102 		pollflag = POLLIN | POLLRDNORM;
1103 		rcd->urgent_poll = rcd->urgent;
1104 	} else {
1105 		pollflag = 0;
1106 		set_bit(QIB_CTXT_WAITING_URG, &rcd->flag);
1107 	}
1108 	spin_unlock_irq(&dd->uctxt_lock);
1109 
1110 	return pollflag;
1111 }
1112 
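/*
 * qib_poll_next() - poll for any received packet: if the receive header
 * queue is empty, request an interrupt on arrival and record that we are
 * waiting (QIB_CTXT_WAITING_RCV); otherwise report POLLIN immediately.
 */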
1113 static unsigned int qib_poll_next(struct qib_ctxtdata *rcd,
1114 				  struct file *fp,
1115 				  struct poll_table_struct *pt)
1116 {
1117 	struct qib_devdata *dd = rcd->dd;
1118 	unsigned pollflag;
1119 
1120 	poll_wait(fp, &rcd->wait, pt);
1121 
1122 	spin_lock_irq(&dd->uctxt_lock);
1123 	if (dd->f_hdrqempty(rcd)) {
1124 		set_bit(QIB_CTXT_WAITING_RCV, &rcd->flag);
1125 		dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_INTRAVAIL_ENB, rcd->ctxt);
1126 		pollflag = 0;
1127 	} else
1128 		pollflag = POLLIN | POLLRDNORM;
1129 	spin_unlock_irq(&dd->uctxt_lock);
1130 
1131 	return pollflag;
1132 }
1133 
1134 static unsigned int qib_poll(struct file *fp, struct poll_table_struct *pt)
1135 {
1136 	struct qib_ctxtdata *rcd;
1137 	unsigned pollflag;
1138 
1139 	rcd = ctxt_fp(fp);
1140 	if (!rcd)
1141 		pollflag = POLLERR;
1142 	else if (rcd->poll_type == QIB_POLL_TYPE_URGENT)
1143 		pollflag = qib_poll_urgent(rcd, fp, pt);
1144 	else  if (rcd->poll_type == QIB_POLL_TYPE_ANYRCV)
1145 		pollflag = qib_poll_next(rcd, fp, pt);
1146 	else /* invalid */
1147 		pollflag = POLLERR;
1148 
1149 	return pollflag;
1150 }
1151 
1152 /*
1153  * Check that userland and driver are compatible for subcontexts.
1154  */
1155 static int qib_compatible_subctxts(int user_swmajor, int user_swminor)
1156 {
1157 	/* this code is written long-hand for clarity */
1158 	if (QIB_USER_SWMAJOR != user_swmajor) {
1159 		/* no promise of compatibility if major mismatch */
1160 		return 0;
1161 	}
1162 	if (QIB_USER_SWMAJOR == 1) {
1163 		switch (QIB_USER_SWMINOR) {
1164 		case 0:
1165 		case 1:
1166 		case 2:
1167 			/* no subctxt implementation so cannot be compatible */
1168 			return 0;
1169 		case 3:
1170 			/* 3 is only compatible with itself */
1171 			return user_swminor == 3;
1172 		default:
1173 			/* >= 4 are compatible (or are expected to be) */
1174 			return user_swminor >= 4;
1175 		}
1176 	}
1177 	/* make no promises yet for future major versions */
1178 	return 0;
1179 }
1180 
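/*
 * init_subctxts() - when the user asks for context sharing, check
 * user/driver compatibility and allocate the vmalloc'ed regions that the
 * subcontext processes will mmap (per-subcontext uregbase pages plus
 * rcvhdrq and eager-buffer copies), then mark the master uninitialized
 * until qib_do_user_init() finishes.
 */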
1181 static int init_subctxts(struct qib_devdata *dd,
1182 			 struct qib_ctxtdata *rcd,
1183 			 const struct qib_user_info *uinfo)
1184 {
1185 	int ret = 0;
1186 	unsigned num_subctxts;
1187 	size_t size;
1188 
1189 	/*
1190 	 * If the user is requesting zero subctxts,
1191 	 * skip the subctxt allocation.
1192 	 */
1193 	if (uinfo->spu_subctxt_cnt <= 0)
1194 		goto bail;
1195 	num_subctxts = uinfo->spu_subctxt_cnt;
1196 
1197 	/* Check for subctxt compatibility */
1198 	if (!qib_compatible_subctxts(uinfo->spu_userversion >> 16,
1199 		uinfo->spu_userversion & 0xffff)) {
1200 		qib_devinfo(dd->pcidev,
1201 			 "Mismatched user version (%d.%d) and driver "
1202 			 "version (%d.%d) while context sharing. Ensure "
1203 			 "that driver and library are from the same "
1204 			 "release.\n",
1205 			 (int) (uinfo->spu_userversion >> 16),
1206 			 (int) (uinfo->spu_userversion & 0xffff),
1207 			 QIB_USER_SWMAJOR, QIB_USER_SWMINOR);
1208 		goto bail;
1209 	}
1210 	if (num_subctxts > QLOGIC_IB_MAX_SUBCTXT) {
1211 		ret = -EINVAL;
1212 		goto bail;
1213 	}
1214 
1215 	rcd->subctxt_uregbase = vmalloc_user(PAGE_SIZE * num_subctxts);
1216 	if (!rcd->subctxt_uregbase) {
1217 		ret = -ENOMEM;
1218 		goto bail;
1219 	}
1220 	/* Note: rcd->rcvhdrq_size isn't initialized yet. */
1221 	size = ALIGN(dd->rcvhdrcnt * dd->rcvhdrentsize *
1222 		     sizeof(u32), PAGE_SIZE) * num_subctxts;
1223 	rcd->subctxt_rcvhdr_base = vmalloc_user(size);
1224 	if (!rcd->subctxt_rcvhdr_base) {
1225 		ret = -ENOMEM;
1226 		goto bail_ureg;
1227 	}
1228 
1229 	rcd->subctxt_rcvegrbuf = vmalloc_user(rcd->rcvegrbuf_chunks *
1230 					      rcd->rcvegrbuf_size *
1231 					      num_subctxts);
1232 	if (!rcd->subctxt_rcvegrbuf) {
1233 		ret = -ENOMEM;
1234 		goto bail_rhdr;
1235 	}
1236 
1237 	rcd->subctxt_cnt = uinfo->spu_subctxt_cnt;
1238 	rcd->subctxt_id = uinfo->spu_subctxt_id;
1239 	rcd->active_slaves = 1;
1240 	rcd->redirect_seq_cnt = 1;
1241 	set_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag);
1242 	goto bail;
1243 
1244 bail_rhdr:
1245 	vfree(rcd->subctxt_rcvhdr_base);
1246 bail_ureg:
1247 	vfree(rcd->subctxt_uregbase);
1248 	rcd->subctxt_uregbase = NULL;
1249 bail:
1250 	return ret;
1251 }
1252 
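/*
 * setup_ctxt() - allocate and initialize the ctxtdata for a newly chosen
 * user context, including the scratch TID page list used later by
 * qib_tid_update(), and attach it to this open file.
 */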
1253 static int setup_ctxt(struct qib_pportdata *ppd, int ctxt,
1254 		      struct file *fp, const struct qib_user_info *uinfo)
1255 {
1256 	struct qib_devdata *dd = ppd->dd;
1257 	struct qib_ctxtdata *rcd;
1258 	void *ptmp = NULL;
1259 	int ret;
1260 
1261 	rcd = qib_create_ctxtdata(ppd, ctxt);
1262 
1263 	/*
1264 	 * Allocate memory for use in qib_tid_update() at open to
1265 	 * reduce cost of expected send setup per message segment
1266 	 */
1267 	if (rcd)
1268 		ptmp = kmalloc(dd->rcvtidcnt * sizeof(u16) +
1269 			       dd->rcvtidcnt * sizeof(struct page **),
1270 			       GFP_KERNEL);
1271 
1272 	if (!rcd || !ptmp) {
1273 		qib_dev_err(dd, "Unable to allocate ctxtdata "
1274 			    "memory, failing open\n");
1275 		ret = -ENOMEM;
1276 		goto bailerr;
1277 	}
1278 	rcd->userversion = uinfo->spu_userversion;
1279 	ret = init_subctxts(dd, rcd, uinfo);
1280 	if (ret)
1281 		goto bailerr;
1282 	rcd->tid_pg_list = ptmp;
1283 	rcd->pid = current->pid;
1284 	init_waitqueue_head(&dd->rcd[ctxt]->wait);
1285 	strlcpy(rcd->comm, current->comm, sizeof(rcd->comm));
1286 	ctxt_fp(fp) = rcd;
1287 	qib_stats.sps_ctxts++;
1288 	dd->freectxts--;
1289 	ret = 0;
1290 	goto bail;
1291 
1292 bailerr:
1293 	dd->rcd[ctxt] = NULL;
1294 	kfree(rcd);
1295 	kfree(ptmp);
1296 bail:
1297 	return ret;
1298 }
1299 
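/* usable() - device present and mapped, port has a LID, link is ACTIVE */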
1300 static inline int usable(struct qib_pportdata *ppd)
1301 {
1302 	struct qib_devdata *dd = ppd->dd;
1303 
1304 	return dd && (dd->flags & QIB_PRESENT) && dd->kregbase && ppd->lid &&
1305 		(ppd->lflags & QIBL_LINKACTIVE);
1306 }
1307 
1308 /*
1309  * Select a context on the given device, either using a requested port
1310  * or the port based on the context number.
1311  */
1312 static int choose_port_ctxt(struct file *fp, struct qib_devdata *dd, u32 port,
1313 			    const struct qib_user_info *uinfo)
1314 {
1315 	struct qib_pportdata *ppd = NULL;
1316 	int ret, ctxt;
1317 
1318 	if (port) {
1319 		if (!usable(dd->pport + port - 1)) {
1320 			ret = -ENETDOWN;
1321 			goto done;
1322 		} else
1323 			ppd = dd->pport + port - 1;
1324 	}
1325 	for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts && dd->rcd[ctxt];
1326 	     ctxt++)
1327 		;
1328 	if (ctxt == dd->cfgctxts) {
1329 		ret = -EBUSY;
1330 		goto done;
1331 	}
1332 	if (!ppd) {
1333 		u32 pidx = ctxt % dd->num_pports;
1334 		if (usable(dd->pport + pidx))
1335 			ppd = dd->pport + pidx;
1336 		else {
1337 			for (pidx = 0; pidx < dd->num_pports && !ppd;
1338 			     pidx++)
1339 				if (usable(dd->pport + pidx))
1340 					ppd = dd->pport + pidx;
1341 		}
1342 	}
1343 	ret = ppd ? setup_ctxt(ppd, ctxt, fp, uinfo) : -ENETDOWN;
1344 done:
1345 	return ret;
1346 }
1347 
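/*
 * find_free_ctxt() - the user named a specific unit; pick a free context
 * on that unit (and on the requested port, if one was given).
 */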
1348 static int find_free_ctxt(int unit, struct file *fp,
1349 			  const struct qib_user_info *uinfo)
1350 {
1351 	struct qib_devdata *dd = qib_lookup(unit);
1352 	int ret;
1353 
1354 	if (!dd || (uinfo->spu_port && uinfo->spu_port > dd->num_pports))
1355 		ret = -ENODEV;
1356 	else
1357 		ret = choose_port_ctxt(fp, dd, uinfo->spu_port, uinfo);
1358 
1359 	return ret;
1360 }
1361 
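/*
 * get_a_ctxt() - no unit was specified: either spread load by choosing
 * the usable device with the fewest contexts in use (QIB_PORT_ALG_ACROSS)
 * or take the first device that can supply a context.
 */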
1362 static int get_a_ctxt(struct file *fp, const struct qib_user_info *uinfo,
1363 		      unsigned alg)
1364 {
1365 	struct qib_devdata *udd = NULL;
1366 	int ret = 0, devmax, npresent, nup, ndev, dusable = 0, i;
1367 	u32 port = uinfo->spu_port, ctxt;
1368 
1369 	devmax = qib_count_units(&npresent, &nup);
1370 	if (!npresent) {
1371 		ret = -ENXIO;
1372 		goto done;
1373 	}
1374 	if (nup == 0) {
1375 		ret = -ENETDOWN;
1376 		goto done;
1377 	}
1378 
1379 	if (alg == QIB_PORT_ALG_ACROSS) {
1380 		unsigned inuse = ~0U;
1381 		/* find device (with ACTIVE ports) with fewest ctxts in use */
1382 		for (ndev = 0; ndev < devmax; ndev++) {
1383 			struct qib_devdata *dd = qib_lookup(ndev);
1384 			unsigned cused = 0, cfree = 0, pusable = 0;
1385 			if (!dd)
1386 				continue;
1387 			if (port && port <= dd->num_pports &&
1388 			    usable(dd->pport + port - 1))
1389 				pusable = 1;
1390 			else
1391 				for (i = 0; i < dd->num_pports; i++)
1392 					if (usable(dd->pport + i))
1393 						pusable++;
1394 			if (!pusable)
1395 				continue;
1396 			for (ctxt = dd->first_user_ctxt; ctxt < dd->cfgctxts;
1397 			     ctxt++)
1398 				if (dd->rcd[ctxt])
1399 					cused++;
1400 				else
1401 					cfree++;
1402 			if (pusable && cfree && cused < inuse) {
1403 				udd = dd;
1404 				inuse = cused;
1405 			}
1406 		}
1407 		if (udd) {
1408 			ret = choose_port_ctxt(fp, udd, port, uinfo);
1409 			goto done;
1410 		}
1411 	} else {
1412 		for (ndev = 0; ndev < devmax; ndev++) {
1413 			struct qib_devdata *dd = qib_lookup(ndev);
1414 			if (dd) {
1415 				ret = choose_port_ctxt(fp, dd, port, uinfo);
1416 				if (!ret)
1417 					goto done;
1418 				if (ret == -EBUSY)
1419 					dusable++;
1420 			}
1421 		}
1422 	}
1423 	ret = dusable ? -EBUSY : -ENETDOWN;
1424 
1425 done:
1426 	return ret;
1427 }
1428 
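/*
 * find_shared_ctxt() - look for an already-open master context with a
 * matching subctxt_id; on a match, verify the sharing parameters and
 * attach this file as the next slave, returning 1.  Returns 0 if nothing
 * matched, or -EINVAL if the parameters (or remaining slots) don't
 * check out.
 */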
1429 static int find_shared_ctxt(struct file *fp,
1430 			    const struct qib_user_info *uinfo)
1431 {
1432 	int devmax, ndev, i;
1433 	int ret = 0;
1434 
1435 	devmax = qib_count_units(NULL, NULL);
1436 
1437 	for (ndev = 0; ndev < devmax; ndev++) {
1438 		struct qib_devdata *dd = qib_lookup(ndev);
1439 
1440 		/* device portion of usable() */
1441 		if (!(dd && (dd->flags & QIB_PRESENT) && dd->kregbase))
1442 			continue;
1443 		for (i = dd->first_user_ctxt; i < dd->cfgctxts; i++) {
1444 			struct qib_ctxtdata *rcd = dd->rcd[i];
1445 
1446 			/* Skip ctxts which are not yet open */
1447 			if (!rcd || !rcd->cnt)
1448 				continue;
1449 			/* Skip ctxt if it doesn't match the requested one */
1450 			if (rcd->subctxt_id != uinfo->spu_subctxt_id)
1451 				continue;
1452 			/* Verify the sharing process matches the master */
1453 			if (rcd->subctxt_cnt != uinfo->spu_subctxt_cnt ||
1454 			    rcd->userversion != uinfo->spu_userversion ||
1455 			    rcd->cnt >= rcd->subctxt_cnt) {
1456 				ret = -EINVAL;
1457 				goto done;
1458 			}
1459 			ctxt_fp(fp) = rcd;
1460 			subctxt_fp(fp) = rcd->cnt++;
1461 			rcd->subpid[subctxt_fp(fp)] = current->pid;
1462 			tidcursor_fp(fp) = 0;
1463 			rcd->active_slaves |= 1 << subctxt_fp(fp);
1464 			ret = 1;
1465 			goto done;
1466 		}
1467 	}
1468 
1469 done:
1470 	return ret;
1471 }
1472 
1473 static int qib_open(struct inode *in, struct file *fp)
1474 {
1475 	/* The real work is performed later in qib_assign_ctxt() */
1476 	fp->private_data = kzalloc(sizeof(struct qib_filedata), GFP_KERNEL);
1477 	if (fp->private_data) /* no cpu affinity by default */
1478 		((struct qib_filedata *)fp->private_data)->rec_cpu_num = -1;
1479 	return fp->private_data ? 0 : -ENOMEM;
1480 }
1481 
1482 /*
1483  * Get ctxt early, so can set affinity prior to memory allocation.
1484  */
1485 static int qib_assign_ctxt(struct file *fp, const struct qib_user_info *uinfo)
1486 {
1487 	int ret;
1488 	int i_minor;
1489 	unsigned swmajor, swminor, alg = QIB_PORT_ALG_ACROSS;
1490 
1491 	/* Check to be sure we haven't already initialized this file */
1492 	if (ctxt_fp(fp)) {
1493 		ret = -EINVAL;
1494 		goto done;
1495 	}
1496 
1497 	/* for now, if major version is different, bail */
1498 	swmajor = uinfo->spu_userversion >> 16;
1499 	if (swmajor != QIB_USER_SWMAJOR) {
1500 		ret = -ENODEV;
1501 		goto done;
1502 	}
1503 
1504 	swminor = uinfo->spu_userversion & 0xffff;
1505 
1506 	if (swminor >= 11 && uinfo->spu_port_alg < QIB_PORT_ALG_COUNT)
1507 		alg = uinfo->spu_port_alg;
1508 
1509 	mutex_lock(&qib_mutex);
1510 
1511 	if (qib_compatible_subctxts(swmajor, swminor) &&
1512 	    uinfo->spu_subctxt_cnt) {
1513 		ret = find_shared_ctxt(fp, uinfo);
1514 		if (ret) {
1515 			if (ret > 0)
1516 				ret = 0;
1517 			goto done_chk_sdma;
1518 		}
1519 	}
1520 
1521 	i_minor = iminor(fp->f_dentry->d_inode) - QIB_USER_MINOR_BASE;
1522 	if (i_minor)
1523 		ret = find_free_ctxt(i_minor - 1, fp, uinfo);
1524 	else
1525 		ret = get_a_ctxt(fp, uinfo, alg);
1526 
1527 done_chk_sdma:
1528 	if (!ret) {
1529 		struct qib_filedata *fd = fp->private_data;
1530 		const struct qib_ctxtdata *rcd = fd->rcd;
1531 		const struct qib_devdata *dd = rcd->dd;
1532 		unsigned int weight;
1533 
1534 		if (dd->flags & QIB_HAS_SEND_DMA) {
1535 			fd->pq = qib_user_sdma_queue_create(&dd->pcidev->dev,
1536 							    dd->unit,
1537 							    rcd->ctxt,
1538 							    fd->subctxt);
1539 			if (!fd->pq)
1540 				ret = -ENOMEM;
1541 		}
1542 
1543 		/*
1544 		 * If the process has NOT already set its affinity, select and
1545 		 * reserve a processor for it, as a rendezvous for all
1546 		 * users of the driver.  If they don't actually later
1547 		 * set affinity to this cpu, or set it to some other cpu,
1548 		 * it just means that sooner or later we don't recommend
1549 		 * a cpu, and let the scheduler do its best.
1550 		 */
1551 		weight = cpumask_weight(tsk_cpus_allowed(current));
1552 		if (!ret && weight >= qib_cpulist_count) {
1553 			int cpu;
1554 			cpu = find_first_zero_bit(qib_cpulist,
1555 						  qib_cpulist_count);
1556 			if (cpu != qib_cpulist_count) {
1557 				__set_bit(cpu, qib_cpulist);
1558 				fd->rec_cpu_num = cpu;
1559 			}
1560 		} else if (weight == 1 &&
1561 			test_bit(cpumask_first(tsk_cpus_allowed(current)),
1562 				 qib_cpulist))
1563 			qib_devinfo(dd->pcidev, "%s PID %u affinity "
1564 				    "set to cpu %d; already allocated\n",
1565 				    current->comm, current->pid,
1566 				    cpumask_first(tsk_cpus_allowed(current)));
1567 	}
1568 
1569 	mutex_unlock(&qib_mutex);
1570 
1571 done:
1572 	return ret;
1573 }
1574 
1575 
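/*
 * qib_do_user_init() - second stage of open: carve out this context's PIO
 * buffer range, allocate the rcvhdrq and eager buffers, and enable
 * receive on the chip.  Slaves just wait here for the master to finish.
 */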
1576 static int qib_do_user_init(struct file *fp,
1577 			    const struct qib_user_info *uinfo)
1578 {
1579 	int ret;
1580 	struct qib_ctxtdata *rcd = ctxt_fp(fp);
1581 	struct qib_devdata *dd;
1582 	unsigned uctxt;
1583 
1584 	/* Subctxts don't need to initialize anything since master did it. */
1585 	if (subctxt_fp(fp)) {
1586 		ret = wait_event_interruptible(rcd->wait,
1587 			!test_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag));
1588 		goto bail;
1589 	}
1590 
1591 	dd = rcd->dd;
1592 
1593 	/* some ctxts may get extra buffers, calculate that here */
1594 	uctxt = rcd->ctxt - dd->first_user_ctxt;
1595 	if (uctxt < dd->ctxts_extrabuf) {
1596 		rcd->piocnt = dd->pbufsctxt + 1;
1597 		rcd->pio_base = rcd->piocnt * uctxt;
1598 	} else {
1599 		rcd->piocnt = dd->pbufsctxt;
1600 		rcd->pio_base = rcd->piocnt * uctxt +
1601 			dd->ctxts_extrabuf;
1602 	}
1603 
1604 	/*
1605 	 * All user buffers are 2KB buffers.  If we ever support
1606 	 * giving 4KB buffers to user processes, this will need some
1607 	 * work.  Can't use piobufbase directly, because it has
1608 	 * both 2K and 4K buffer base values.  So check and handle.
1609 	 */
1610 	if ((rcd->pio_base + rcd->piocnt) > dd->piobcnt2k) {
1611 		if (rcd->pio_base >= dd->piobcnt2k) {
1612 			qib_dev_err(dd,
1613 				    "%u:ctxt%u: no 2KB buffers available\n",
1614 				    dd->unit, rcd->ctxt);
1615 			ret = -ENOBUFS;
1616 			goto bail;
1617 		}
1618 		rcd->piocnt = dd->piobcnt2k - rcd->pio_base;
1619 		qib_dev_err(dd, "Ctxt%u: would use 4KB bufs, using %u\n",
1620 			    rcd->ctxt, rcd->piocnt);
1621 	}
1622 
1623 	rcd->piobufs = dd->pio2k_bufbase + rcd->pio_base * dd->palign;
1624 	qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt,
1625 			       TXCHK_CHG_TYPE_USER, rcd);
1626 	/*
1627 	 * try to ensure that processes start up with consistent avail update
1628 	 * for their own range, at least.  If the system is very quiet, it might
1629 	 * have the in-memory copy out of date at startup for this range of
1630 	 * buffers, when a context gets re-used.  Do after the chg_pioavail
1631 	 * and before the rest of setup, so it's "almost certain" the dma
1632 	 * will have occurred (can't 100% guarantee, but should be many
1633 	 * decimals of 9s, with this ordering), given how much else happens
1634 	 * after this.
1635 	 */
1636 	dd->f_sendctrl(dd->pport, QIB_SENDCTRL_AVAIL_BLIP);
1637 
1638 	/*
1639 	 * Now allocate the rcvhdr Q and eager TIDs; skip the TID
1640 	 * array for the time being.  If rcd->ctxt > chip-supported,
1641 	 * we will need extra handling here, by routing overflow
1642 	 * through ctxt 0, someday
1643 	 */
1644 	ret = qib_create_rcvhdrq(dd, rcd);
1645 	if (!ret)
1646 		ret = qib_setup_eagerbufs(rcd);
1647 	if (ret)
1648 		goto bail_pio;
1649 
1650 	rcd->tidcursor = 0; /* start at beginning after open */
1651 
1652 	/* initialize poll variables... */
1653 	rcd->urgent = 0;
1654 	rcd->urgent_poll = 0;
1655 
1656 	/*
1657 	 * Now enable the ctxt for receive.
1658 	 * For chips that are set to DMA the tail register to memory
1659 	 * when it changes (and when the update bit transitions from
1660 	 * 0 to 1), we turn it off and then back on.
1661 	 * This will (very briefly) affect any other open ctxts, but the
1662 	 * duration is very short, and therefore isn't an issue.  We
1663 	 * explicitly set the in-memory tail copy to 0 beforehand, so we
1664 	 * don't have to wait to be sure the DMA update has happened
1665 	 * (chip resets head/tail to 0 on transition to enable).
1666 	 */
1667 	if (rcd->rcvhdrtail_kvaddr)
1668 		qib_clear_rcvhdrtail(rcd);
1669 
1670 	dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_ENB | QIB_RCVCTRL_TIDFLOW_ENB,
1671 		      rcd->ctxt);
1672 
1673 	/* Notify any waiting slaves */
1674 	if (rcd->subctxt_cnt) {
1675 		clear_bit(QIB_CTXT_MASTER_UNINIT, &rcd->flag);
1676 		wake_up(&rcd->wait);
1677 	}
1678 	return 0;
1679 
1680 bail_pio:
1681 	qib_chg_pioavailkernel(dd, rcd->pio_base, rcd->piocnt,
1682 			       TXCHK_CHG_TYPE_KERN, rcd);
1683 bail:
1684 	return ret;
1685 }
1686 
1687 /**
1688  * unlock_expected_tids - unlock any expected TID entries the context still had in use
1689  * @rcd: ctxt
1690  *
1691  * We don't actually update the chip here, because we do a bulk update
1692  * below, using f_clear_tids.
1693  */
1694 static void unlock_expected_tids(struct qib_ctxtdata *rcd)
1695 {
1696 	struct qib_devdata *dd = rcd->dd;
1697 	int ctxt_tidbase = rcd->ctxt * dd->rcvtidcnt;
1698 	int i, cnt = 0, maxtid = ctxt_tidbase + dd->rcvtidcnt;
1699 
1700 	for (i = ctxt_tidbase; i < maxtid; i++) {
1701 		struct page *p = dd->pageshadow[i];
1702 		dma_addr_t phys;
1703 
1704 		if (!p)
1705 			continue;
1706 
1707 		phys = dd->physshadow[i];
1708 		dd->physshadow[i] = dd->tidinvalid;
1709 		dd->pageshadow[i] = NULL;
1710 		pci_unmap_page(dd->pcidev, phys, PAGE_SIZE,
1711 			       PCI_DMA_FROMDEVICE);
1712 		qib_release_user_pages(&p, 1);
1713 		cnt++;
1714 	}
1715 }
1716 
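/*
 * qib_close() - release a context, or just one subcontext slot.  The last
 * closer disables receive, returns the PIO buffers and expected TIDs to
 * the kernel, and frees the ctxtdata after dropping qib_mutex.
 */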
1717 static int qib_close(struct inode *in, struct file *fp)
1718 {
1719 	int ret = 0;
1720 	struct qib_filedata *fd;
1721 	struct qib_ctxtdata *rcd;
1722 	struct qib_devdata *dd;
1723 	unsigned long flags;
1724 	unsigned ctxt;
1725 	pid_t pid;
1726 
1727 	mutex_lock(&qib_mutex);
1728 
1729 	fd = fp->private_data;
1730 	fp->private_data = NULL;
1731 	rcd = fd->rcd;
1732 	if (!rcd) {
1733 		mutex_unlock(&qib_mutex);
1734 		goto bail;
1735 	}
1736 
1737 	dd = rcd->dd;
1738 
1739 	/* ensure all pio buffer writes in progress are flushed */
1740 	qib_flush_wc();
1741 
1742 	/* drain user sdma queue */
1743 	if (fd->pq) {
1744 		qib_user_sdma_queue_drain(rcd->ppd, fd->pq);
1745 		qib_user_sdma_queue_destroy(fd->pq);
1746 	}
1747 
1748 	if (fd->rec_cpu_num != -1)
1749 		__clear_bit(fd->rec_cpu_num, qib_cpulist);
1750 
1751 	if (--rcd->cnt) {
1752 		/*
1753 		 * XXX If the master closes the context before the slave(s),
1754 		 * revoke the mmap for the eager receive queue so
1755 		 * the slave(s) don't wait for receive data forever.
1756 		 */
1757 		rcd->active_slaves &= ~(1 << fd->subctxt);
1758 		rcd->subpid[fd->subctxt] = 0;
1759 		mutex_unlock(&qib_mutex);
1760 		goto bail;
1761 	}
1762 
1763 	/* early; no interrupt users after this */
1764 	spin_lock_irqsave(&dd->uctxt_lock, flags);
1765 	ctxt = rcd->ctxt;
1766 	dd->rcd[ctxt] = NULL;
1767 	pid = rcd->pid;
1768 	rcd->pid = 0;
1769 	spin_unlock_irqrestore(&dd->uctxt_lock, flags);
1770 
1771 	if (rcd->rcvwait_to || rcd->piowait_to ||
1772 	    rcd->rcvnowait || rcd->pionowait) {
1773 		rcd->rcvwait_to = 0;
1774 		rcd->piowait_to = 0;
1775 		rcd->rcvnowait = 0;
1776 		rcd->pionowait = 0;
1777 	}
1778 	if (rcd->flag)
1779 		rcd->flag = 0;
1780 
1781 	if (dd->kregbase) {
1782 		/* atomically clear receive enable ctxt and intr avail. */
1783 		dd->f_rcvctrl(rcd->ppd, QIB_RCVCTRL_CTXT_DIS |
1784 				  QIB_RCVCTRL_INTRAVAIL_DIS, ctxt);
1785 
1786 		/* clean up the pkeys for this ctxt user */
1787 		qib_clean_part_key(rcd, dd);
1788 		qib_disarm_piobufs(dd, rcd->pio_base, rcd->piocnt);
1789 		qib_chg_pioavailkernel(dd, rcd->pio_base,
1790 				       rcd->piocnt, TXCHK_CHG_TYPE_KERN, NULL);
1791 
1792 		dd->f_clear_tids(dd, rcd);
1793 
1794 		if (dd->pageshadow)
1795 			unlock_expected_tids(rcd);
1796 		qib_stats.sps_ctxts--;
1797 		dd->freectxts--;
1798 	}
1799 
1800 	mutex_unlock(&qib_mutex);
1801 	qib_free_ctxtdata(dd, rcd); /* after releasing the mutex */
1802 
1803 bail:
1804 	kfree(fd);
1805 	return ret;
1806 }
1807 
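/*
 * Service QIB_CMD_CTXT_INFO: copy a snapshot of this context's identity
 * (unit, port, ctxt/subctxt numbers, the configured context counts and the
 * pinned receive CPU, if any) out to the caller-supplied user buffer.
 */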
1808 static int qib_ctxt_info(struct file *fp, struct qib_ctxt_info __user *uinfo)
1809 {
1810 	struct qib_ctxt_info info;
1811 	int ret;
1812 	size_t sz;
1813 	struct qib_ctxtdata *rcd = ctxt_fp(fp);
1814 	struct qib_filedata *fd;
1815 
1816 	fd = fp->private_data;
1817 
	/* zero first so no uninitialized stack bytes can reach userspace */
	memset(&info, 0, sizeof(info));

1818 	info.num_active = qib_count_active_units();
1819 	info.unit = rcd->dd->unit;
1820 	info.port = rcd->ppd->port;
1821 	info.ctxt = rcd->ctxt;
1822 	info.subctxt = subctxt_fp(fp);
1823 	/* Number of user ctxts available for this device. */
1824 	info.num_ctxts = rcd->dd->cfgctxts - rcd->dd->first_user_ctxt;
1825 	info.num_subctxts = rcd->subctxt_cnt;
1826 	info.rec_cpu = fd->rec_cpu_num;
1827 	sz = sizeof(info);
1828 
1829 	if (copy_to_user(uinfo, &info, sz)) {
1830 		ret = -EFAULT;
1831 		goto bail;
1832 	}
1833 	ret = 0;
1834 
1835 bail:
1836 	return ret;
1837 }
1838 
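/*
 * The next two helpers back QIB_CMD_SDMA_INFLIGHT and QIB_CMD_SDMA_COMPLETE:
 * each reports one of the user SDMA queue's counters (inflight or completed)
 * to the caller-supplied user address; the "complete" case first pushes the
 * queue forward via qib_user_sdma_make_progress().
 */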
1839 static int qib_sdma_get_inflight(struct qib_user_sdma_queue *pq,
1840 				 u32 __user *inflightp)
1841 {
1842 	const u32 val = qib_user_sdma_inflight_counter(pq);
1843 
1844 	if (put_user(val, inflightp))
1845 		return -EFAULT;
1846 
1847 	return 0;
1848 }
1849 
1850 static int qib_sdma_get_complete(struct qib_pportdata *ppd,
1851 				 struct qib_user_sdma_queue *pq,
1852 				 u32 __user *completep)
1853 {
1854 	u32 val;
1855 	int err;
1856 
1857 	if (!pq)
1858 		return -EINVAL;
1859 
1860 	err = qib_user_sdma_make_progress(ppd, pq);
1861 	if (err < 0)
1862 		return err;
1863 
1864 	val = qib_user_sdma_complete_counter(pq);
1865 	if (put_user(val, completep))
1866 		return -EFAULT;
1867 
1868 	return 0;
1869 }
1870 
1871 static int disarm_req_delay(struct qib_ctxtdata *rcd)
1872 {
1873 	int ret = 0;
1874 
1875 	if (!usable(rcd->ppd)) {
1876 		int i;
1877 		/*
1878 		 * if link is down, or otherwise not usable, delay
1879 		 * the caller up to 30 seconds, so we don't thrash
1880 		 * in trying to get the chip back to ACTIVE, and
1881 		 * set flag so they make the call again.
1882 		 */
1883 		if (rcd->user_event_mask) {
1884 			/*
1885 			 * subctxt_cnt is 0 if not shared, so do base
1886 			 * separately, first, then remaining subctxt, if any
1887 			 */
1888 			set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
1889 				&rcd->user_event_mask[0]);
1890 			for (i = 1; i < rcd->subctxt_cnt; i++)
1891 				set_bit(_QIB_EVENT_DISARM_BUFS_BIT,
1892 					&rcd->user_event_mask[i]);
1893 		}
1894 		for (i = 0; !usable(rcd->ppd) && i < 300; i++)
1895 			msleep(100);
1896 		ret = -ENETDOWN;
1897 	}
1898 	return ret;
1899 }
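
/*
 * Illustrative only: a plausible userspace pattern for the disarm protocol
 * above, assuming a hypothetical descriptor "fd" opened on the device node
 * and the uapi definitions from qib_common.h.  While the link is unusable
 * the driver may sleep up to 30 seconds in disarm_req_delay() and then fail
 * with -ENETDOWN, leaving _QIB_EVENT_DISARM_BUFS_BIT set so the process
 * knows to ask again later:
 *
 *	struct qib_cmd cmd = { .type = QIB_CMD_DISARM_BUFS };
 *
 *	while (write(fd, &cmd, sizeof(cmd)) < 0 && errno == ENETDOWN)
 *		sleep(1);
 *
 * This is a sketch of the expected flow, not code taken from any particular
 * user library.
 */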
1900 
1901 /*
1902  * Find all user contexts in use, and set the specified bit in their
1903  * event mask.
1904  * See also find_ctxt() for a similar use, that is specific to send buffers.
1905  */
1906 int qib_set_uevent_bits(struct qib_pportdata *ppd, const int evtbit)
1907 {
1908 	struct qib_ctxtdata *rcd;
1909 	unsigned ctxt;
1910 	int ret = 0;
1911 	unsigned long flags;
1912 
1913 	spin_lock_irqsave(&ppd->dd->uctxt_lock, flags);
1914 	for (ctxt = ppd->dd->first_user_ctxt; ctxt < ppd->dd->cfgctxts;
1915 	     ctxt++) {
1916 		rcd = ppd->dd->rcd[ctxt];
1917 		if (!rcd)
1918 			continue;
1919 		if (rcd->user_event_mask) {
1920 			int i;
1921 			/*
1922 			 * subctxt_cnt is 0 if not shared, so do base
1923 			 * separately, first, then remaining subctxt, if any
1924 			 */
1925 			set_bit(evtbit, &rcd->user_event_mask[0]);
1926 			for (i = 1; i < rcd->subctxt_cnt; i++)
1927 				set_bit(evtbit, &rcd->user_event_mask[i]);
1928 		}
1929 		ret = 1;
		/* keep going: set the bit for every active context, per above */
1931 	}
1932 	spin_unlock_irqrestore(&ppd->dd->uctxt_lock, flags);
1933 
1934 	return ret;
1935 }
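
/*
 * This is the kernel-side producer for the user event mechanism; a
 * hypothetical call such as
 *
 *	qib_set_uevent_bits(ppd, _QIB_EVENT_DISARM_BUFS_BIT);
 *
 * asks every user context on the device to disarm its send buffers.  The
 * bits are later consumed through QIB_CMD_ACK_EVENT (see
 * qib_user_event_ack() below).
 */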
1936 
1937 /*
1938  * Clear the event notifier events for this context.
1939  * For the DISARM_BUFS case, we also take action (this obsoletes
1940  * the older QIB_CMD_DISARM_BUFS, but we keep it for backwards
1941  * compatibility).
1942  * Other bits don't currently require actions, just atomically clear.
1943  * The user process then performs whatever action is appropriate to the
1944  * bit having been set, if desired, and checks again in the future.
1945  */
1946 static int qib_user_event_ack(struct qib_ctxtdata *rcd, int subctxt,
1947 			      unsigned long events)
1948 {
1949 	int ret = 0, i;
1950 
1951 	for (i = 0; i <= _QIB_MAX_EVENT_BIT; i++) {
1952 		if (!test_bit(i, &events))
1953 			continue;
1954 		if (i == _QIB_EVENT_DISARM_BUFS_BIT) {
1955 			(void)qib_disarm_piobufs_ifneeded(rcd);
1956 			ret = disarm_req_delay(rcd);
1957 		} else
1958 			clear_bit(i, &rcd->user_event_mask[subctxt]);
1959 	}
1960 	return ret;
1961 }
1962 
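/*
 * qib_write() is the driver's command channel: userspace writes a
 * struct qib_cmd whose ->type selects which member of the embedded union
 * follows, and a successful call returns the number of bytes consumed
 * (the type plus, where applicable, the typed payload).  All commands
 * except QIB_CMD_ASSIGN_CTXT require that the fd already has a context
 * assigned.
 *
 * A minimal sketch of the calling convention (illustrative, not taken from
 * any particular user library):
 *
 *	struct qib_cmd cmd = { .type = QIB_CMD_ACK_EVENT };
 *
 *	cmd.cmd.event_mask = 1ul << _QIB_EVENT_DISARM_BUFS_BIT;
 *	if (write(fd, &cmd, sizeof(cmd)) < 0)
 *		perror("QIB_CMD_ACK_EVENT");
 *
 * where "fd" is a hypothetical descriptor already opened (and assigned a
 * context) on one of the device nodes created below.
 */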
1963 static ssize_t qib_write(struct file *fp, const char __user *data,
1964 			 size_t count, loff_t *off)
1965 {
1966 	const struct qib_cmd __user *ucmd;
1967 	struct qib_ctxtdata *rcd;
1968 	const void __user *src;
1969 	size_t consumed, copy = 0;
1970 	struct qib_cmd cmd;
1971 	ssize_t ret = 0;
1972 	void *dest;
1973 
1974 	if (count < sizeof(cmd.type)) {
1975 		ret = -EINVAL;
1976 		goto bail;
1977 	}
1978 
1979 	ucmd = (const struct qib_cmd __user *) data;
1980 
1981 	if (copy_from_user(&cmd.type, &ucmd->type, sizeof(cmd.type))) {
1982 		ret = -EFAULT;
1983 		goto bail;
1984 	}
1985 
1986 	consumed = sizeof(cmd.type);
1987 
1988 	switch (cmd.type) {
1989 	case QIB_CMD_ASSIGN_CTXT:
1990 	case QIB_CMD_USER_INIT:
1991 		copy = sizeof(cmd.cmd.user_info);
1992 		dest = &cmd.cmd.user_info;
1993 		src = &ucmd->cmd.user_info;
1994 		break;
1995 
1996 	case QIB_CMD_RECV_CTRL:
1997 		copy = sizeof(cmd.cmd.recv_ctrl);
1998 		dest = &cmd.cmd.recv_ctrl;
1999 		src = &ucmd->cmd.recv_ctrl;
2000 		break;
2001 
2002 	case QIB_CMD_CTXT_INFO:
2003 		copy = sizeof(cmd.cmd.ctxt_info);
2004 		dest = &cmd.cmd.ctxt_info;
2005 		src = &ucmd->cmd.ctxt_info;
2006 		break;
2007 
2008 	case QIB_CMD_TID_UPDATE:
2009 	case QIB_CMD_TID_FREE:
2010 		copy = sizeof(cmd.cmd.tid_info);
2011 		dest = &cmd.cmd.tid_info;
2012 		src = &ucmd->cmd.tid_info;
2013 		break;
2014 
2015 	case QIB_CMD_SET_PART_KEY:
2016 		copy = sizeof(cmd.cmd.part_key);
2017 		dest = &cmd.cmd.part_key;
2018 		src = &ucmd->cmd.part_key;
2019 		break;
2020 
2021 	case QIB_CMD_DISARM_BUFS:
2022 	case QIB_CMD_PIOAVAILUPD: /* force an update of PIOAvail reg */
2023 		copy = 0;
2024 		src = NULL;
2025 		dest = NULL;
2026 		break;
2027 
2028 	case QIB_CMD_POLL_TYPE:
2029 		copy = sizeof(cmd.cmd.poll_type);
2030 		dest = &cmd.cmd.poll_type;
2031 		src = &ucmd->cmd.poll_type;
2032 		break;
2033 
2034 	case QIB_CMD_ARMLAUNCH_CTRL:
2035 		copy = sizeof(cmd.cmd.armlaunch_ctrl);
2036 		dest = &cmd.cmd.armlaunch_ctrl;
2037 		src = &ucmd->cmd.armlaunch_ctrl;
2038 		break;
2039 
2040 	case QIB_CMD_SDMA_INFLIGHT:
2041 		copy = sizeof(cmd.cmd.sdma_inflight);
2042 		dest = &cmd.cmd.sdma_inflight;
2043 		src = &ucmd->cmd.sdma_inflight;
2044 		break;
2045 
2046 	case QIB_CMD_SDMA_COMPLETE:
2047 		copy = sizeof(cmd.cmd.sdma_complete);
2048 		dest = &cmd.cmd.sdma_complete;
2049 		src = &ucmd->cmd.sdma_complete;
2050 		break;
2051 
2052 	case QIB_CMD_ACK_EVENT:
2053 		copy = sizeof(cmd.cmd.event_mask);
2054 		dest = &cmd.cmd.event_mask;
2055 		src = &ucmd->cmd.event_mask;
2056 		break;
2057 
2058 	default:
2059 		ret = -EINVAL;
2060 		goto bail;
2061 	}
2062 
2063 	if (copy) {
2064 		if ((count - consumed) < copy) {
2065 			ret = -EINVAL;
2066 			goto bail;
2067 		}
2068 		if (copy_from_user(dest, src, copy)) {
2069 			ret = -EFAULT;
2070 			goto bail;
2071 		}
2072 		consumed += copy;
2073 	}
2074 
2075 	rcd = ctxt_fp(fp);
2076 	if (!rcd && cmd.type != QIB_CMD_ASSIGN_CTXT) {
2077 		ret = -EINVAL;
2078 		goto bail;
2079 	}
2080 
2081 	switch (cmd.type) {
2082 	case QIB_CMD_ASSIGN_CTXT:
2083 		ret = qib_assign_ctxt(fp, &cmd.cmd.user_info);
2084 		if (ret)
2085 			goto bail;
2086 		break;
2087 
2088 	case QIB_CMD_USER_INIT:
2089 		ret = qib_do_user_init(fp, &cmd.cmd.user_info);
2090 		if (ret)
2091 			goto bail;
2092 		ret = qib_get_base_info(fp, (void __user *) (unsigned long)
2093 					cmd.cmd.user_info.spu_base_info,
2094 					cmd.cmd.user_info.spu_base_info_size);
2095 		break;
2096 
2097 	case QIB_CMD_RECV_CTRL:
2098 		ret = qib_manage_rcvq(rcd, subctxt_fp(fp), cmd.cmd.recv_ctrl);
2099 		break;
2100 
2101 	case QIB_CMD_CTXT_INFO:
2102 		ret = qib_ctxt_info(fp, (struct qib_ctxt_info __user *)
2103 				    (unsigned long) cmd.cmd.ctxt_info);
2104 		break;
2105 
2106 	case QIB_CMD_TID_UPDATE:
2107 		ret = qib_tid_update(rcd, fp, &cmd.cmd.tid_info);
2108 		break;
2109 
2110 	case QIB_CMD_TID_FREE:
2111 		ret = qib_tid_free(rcd, subctxt_fp(fp), &cmd.cmd.tid_info);
2112 		break;
2113 
2114 	case QIB_CMD_SET_PART_KEY:
2115 		ret = qib_set_part_key(rcd, cmd.cmd.part_key);
2116 		break;
2117 
2118 	case QIB_CMD_DISARM_BUFS:
2119 		(void)qib_disarm_piobufs_ifneeded(rcd);
2120 		ret = disarm_req_delay(rcd);
2121 		break;
2122 
2123 	case QIB_CMD_PIOAVAILUPD:
2124 		qib_force_pio_avail_update(rcd->dd);
2125 		break;
2126 
2127 	case QIB_CMD_POLL_TYPE:
2128 		rcd->poll_type = cmd.cmd.poll_type;
2129 		break;
2130 
2131 	case QIB_CMD_ARMLAUNCH_CTRL:
2132 		rcd->dd->f_set_armlaunch(rcd->dd, cmd.cmd.armlaunch_ctrl);
2133 		break;
2134 
2135 	case QIB_CMD_SDMA_INFLIGHT:
2136 		ret = qib_sdma_get_inflight(user_sdma_queue_fp(fp),
2137 					    (u32 __user *) (unsigned long)
2138 					    cmd.cmd.sdma_inflight);
2139 		break;
2140 
2141 	case QIB_CMD_SDMA_COMPLETE:
2142 		ret = qib_sdma_get_complete(rcd->ppd,
2143 					    user_sdma_queue_fp(fp),
2144 					    (u32 __user *) (unsigned long)
2145 					    cmd.cmd.sdma_complete);
2146 		break;
2147 
2148 	case QIB_CMD_ACK_EVENT:
2149 		ret = qib_user_event_ack(rcd, subctxt_fp(fp),
2150 					 cmd.cmd.event_mask);
2151 		break;
2152 	}
2153 
2154 	if (ret >= 0)
2155 		ret = consumed;
2156 
2157 bail:
2158 	return ret;
2159 }
2160 
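/*
 * The aio_write() entry point feeds the user SDMA engine: the iovec array
 * is handed to qib_user_sdma_writev(), which queues the described packets
 * on the per-fd SDMA queue, if one exists (-EINVAL otherwise).
 */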
2161 static ssize_t qib_aio_write(struct kiocb *iocb, const struct iovec *iov,
2162 			     unsigned long dim, loff_t off)
2163 {
2164 	struct qib_filedata *fd = iocb->ki_filp->private_data;
2165 	struct qib_ctxtdata *rcd = ctxt_fp(iocb->ki_filp);
2166 	struct qib_user_sdma_queue *pq = fd->pq;
2167 
2168 	if (!dim || !pq)
2169 		return -EINVAL;
2170 
2171 	return qib_user_sdma_writev(rcd, pq, iov, dim);
2172 }
2173 
2174 static struct class *qib_class;
2175 static dev_t qib_dev;
2176 
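/*
 * Character-device plumbing: qib_cdev_init() binds one minor of the
 * driver's chrdev region (qib_dev) to the given file_operations and
 * publishes a matching node in /dev through the shared "ipath" device
 * class; qib_cdev_cleanup() undoes both.  On failure the returned cdev and
 * device pointers are left NULL, so cleanup is safe to call unconditionally.
 */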
2177 int qib_cdev_init(int minor, const char *name,
2178 		  const struct file_operations *fops,
2179 		  struct cdev **cdevp, struct device **devp)
2180 {
2181 	const dev_t dev = MKDEV(MAJOR(qib_dev), minor);
2182 	struct cdev *cdev;
2183 	struct device *device = NULL;
2184 	int ret;
2185 
2186 	cdev = cdev_alloc();
2187 	if (!cdev) {
2188 		printk(KERN_ERR QIB_DRV_NAME
2189 		       ": Could not allocate cdev for minor %d, %s\n",
2190 		       minor, name);
2191 		ret = -ENOMEM;
2192 		goto done;
2193 	}
2194 
2195 	cdev->owner = THIS_MODULE;
2196 	cdev->ops = fops;
2197 	kobject_set_name(&cdev->kobj, name);
2198 
2199 	ret = cdev_add(cdev, dev, 1);
2200 	if (ret < 0) {
2201 		printk(KERN_ERR QIB_DRV_NAME
2202 		       ": Could not add cdev for minor %d, %s (err %d)\n",
2203 		       minor, name, -ret);
2204 		goto err_cdev;
2205 	}
2206 
2207 	device = device_create(qib_class, NULL, dev, NULL, name);
2208 	if (!IS_ERR(device))
2209 		goto done;
2210 	ret = PTR_ERR(device);
2211 	device = NULL;
2212 	printk(KERN_ERR QIB_DRV_NAME
2213 	       ": Could not create device for minor %d, %s (err %d)\n",
2214 	       minor, name, -ret);
2215 err_cdev:
2216 	cdev_del(cdev);
2217 	cdev = NULL;
2218 done:
2219 	*cdevp = cdev;
2220 	*devp = device;
2221 	return ret;
2222 }
2223 
2224 void qib_cdev_cleanup(struct cdev **cdevp, struct device **devp)
2225 {
2226 	struct device *device = *devp;
2227 
2228 	if (device) {
2229 		device_unregister(device);
2230 		*devp = NULL;
2231 	}
2232 
2233 	if (*cdevp) {
2234 		cdev_del(*cdevp);
2235 		*cdevp = NULL;
2236 	}
2237 }
2238 
2239 static struct cdev *wildcard_cdev;
2240 static struct device *wildcard_device;
2241 
2242 int __init qib_dev_init(void)
2243 {
2244 	int ret;
2245 
2246 	ret = alloc_chrdev_region(&qib_dev, 0, QIB_NMINORS, QIB_DRV_NAME);
2247 	if (ret < 0) {
2248 		printk(KERN_ERR QIB_DRV_NAME
2249 		       ": Could not allocate chrdev region (err %d)\n", -ret);
2250 		goto done;
2251 	}
2252 
2253 	qib_class = class_create(THIS_MODULE, "ipath");
2254 	if (IS_ERR(qib_class)) {
2255 		ret = PTR_ERR(qib_class);
2256 		printk(KERN_ERR QIB_DRV_NAME
2257 		       ": Could not create device class (err %d)\n", -ret);
2258 		unregister_chrdev_region(qib_dev, QIB_NMINORS);
		/* don't leave an ERR_PTR behind for qib_dev_cleanup() */
		qib_class = NULL;
2259 	}
2260 
2261 done:
2262 	return ret;
2263 }
2264 
2265 void qib_dev_cleanup(void)
2266 {
2267 	if (qib_class) {
2268 		class_destroy(qib_class);
2269 		qib_class = NULL;
2270 	}
2271 
2272 	unregister_chrdev_region(qib_dev, QIB_NMINORS);
2273 }
2274 
2275 static atomic_t user_count = ATOMIC_INIT(0);
2276 
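/*
 * User /dev nodes: minor 0 is a "wildcard" node named "ipath", shared by
 * all units and refcounted through user_count; each unit additionally gets
 * its own "ipath<unit>" node at minor (unit + 1).  The first qib_user_add()
 * creates the wildcard node, and the last qib_user_remove() tears it down.
 */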
2277 static void qib_user_remove(struct qib_devdata *dd)
2278 {
2279 	if (atomic_dec_return(&user_count) == 0)
2280 		qib_cdev_cleanup(&wildcard_cdev, &wildcard_device);
2281 
2282 	qib_cdev_cleanup(&dd->user_cdev, &dd->user_device);
2283 }
2284 
2285 static int qib_user_add(struct qib_devdata *dd)
2286 {
2287 	char name[10];
2288 	int ret;
2289 
2290 	if (atomic_inc_return(&user_count) == 1) {
2291 		ret = qib_cdev_init(0, "ipath", &qib_file_ops,
2292 				    &wildcard_cdev, &wildcard_device);
2293 		if (ret)
2294 			goto done;
2295 	}
2296 
2297 	snprintf(name, sizeof(name), "ipath%d", dd->unit);
2298 	ret = qib_cdev_init(dd->unit + 1, name, &qib_file_ops,
2299 			    &dd->user_cdev, &dd->user_device);
2300 	if (ret)
2301 		qib_user_remove(dd);
2302 done:
2303 	return ret;
2304 }
2305 
2306 /*
2307  * Create per-unit files in /dev
2308  */
2309 int qib_device_create(struct qib_devdata *dd)
2310 {
2311 	int r, ret;
2312 
2313 	r = qib_user_add(dd);
2314 	ret = qib_diag_add(dd);
2315 	if (r && !ret)
2316 		ret = r;
2317 	return ret;
2318 }
2319 
2320 /*
2321  * Remove per-unit files in /dev
2322  * void, core kernel returns no errors for this stuff
2323  */
2324 void qib_device_remove(struct qib_devdata *dd)
2325 {
2326 	qib_user_remove(dd);
2327 	qib_diag_remove(dd);
2328 }
2329