xref: /linux/drivers/infiniband/hw/hfi1/user_exp_rcv.c (revision ff124bbbca1d3a07fa1392ffdbbdeece71f68ece)
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 /*
3  * Copyright(c) 2020 Cornelis Networks, Inc.
4  * Copyright(c) 2015-2018 Intel Corporation.
5  */
6 #include <asm/page.h>
7 #include <linux/string.h>
8 
9 #include "mmu_rb.h"
10 #include "user_exp_rcv.h"
11 #include "trace.h"
12 
/* Forward declarations for the static helpers defined later in this file. */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd);
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages);
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages);
static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
				    struct tid_rb_node *tnode);
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq);
static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
			         const struct mmu_notifier_range *range,
			         unsigned long cur_seq);
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *,
			    struct tid_group *grp, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped);
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo);
static void __clear_tid_node(struct hfi1_filedata *fd,
			     struct tid_rb_node *node);
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node);
37 
/* Per-TID notifier: invalidates one programmed RcvArray entry on unmap. */
static const struct mmu_interval_notifier_ops tid_mn_ops = {
	.invalidate = tid_rb_invalidate,
};
/* Whole-buffer notifier: detects an invalidation racing with setup. */
static const struct mmu_interval_notifier_ops tid_cover_ops = {
	.invalidate = tid_cover_invalidate,
};
44 
45 /*
46  * Initialize context and file private data needed for Expected
47  * receive caching. This needs to be done after the context has
48  * been configured with the eager/expected RcvEntry counts.
49  */
int hfi1_user_exp_rcv_init(struct hfi1_filedata *fd,
			   struct hfi1_ctxtdata *uctxt)
{
	int ret = 0;

	/* One rb-node back-pointer slot per expected RcvArray entry. */
	fd->entry_to_rb = kzalloc_objs(*fd->entry_to_rb, uctxt->expected_count);
	if (!fd->entry_to_rb)
		return -ENOMEM;

	if (!HFI1_CAP_UGET_MASK(uctxt->flags, TID_UNMAP)) {
		/* TID caching in use: track TIDs invalidated by the MMU. */
		fd->invalid_tid_idx = 0;
		fd->invalid_tids = kzalloc_objs(*fd->invalid_tids,
						uctxt->expected_count);
		if (!fd->invalid_tids) {
			kfree(fd->entry_to_rb);
			fd->entry_to_rb = NULL;
			return -ENOMEM;
		}
		fd->use_mn = true;
	}

	/*
	 * PSM does not have a good way to separate, count, and
	 * effectively enforce a limit on RcvArray entries used by
	 * subctxts (when context sharing is used) when TID caching
	 * is enabled. To help with that, we calculate a per-process
	 * RcvArray entry share and enforce that.
	 * If TID caching is not in use, PSM deals with usage on its
	 * own. In that case, we allow any subctxt to take all of the
	 * entries.
	 *
	 * Make sure that we set the tid counts only after successful
	 * init.
	 */
	spin_lock(&fd->tid_lock);
	if (uctxt->subctxt_cnt && fd->use_mn) {
		u16 remainder;

		/* Even share; the first 'remainder' subctxts get one extra. */
		fd->tid_limit = uctxt->expected_count / uctxt->subctxt_cnt;
		remainder = uctxt->expected_count % uctxt->subctxt_cnt;
		if (remainder && fd->subctxt < remainder)
			fd->tid_limit++;
	} else {
		fd->tid_limit = uctxt->expected_count;
	}
	spin_unlock(&fd->tid_lock);

	return ret;
}
99 
/*
 * Tear down all Expected receive state owned by this file descriptor:
 * release any RcvArray entries still programmed on its behalf, then free
 * the per-fd tracking arrays allocated in hfi1_user_exp_rcv_init().
 */
void hfi1_user_exp_rcv_free(struct hfi1_filedata *fd)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	/* exp_mutex serializes RcvArray group list manipulation. */
	mutex_lock(&uctxt->exp_mutex);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_full_list))
		unlock_exp_tids(uctxt, &uctxt->tid_full_list, fd);
	if (!EXP_TID_SET_EMPTY(uctxt->tid_used_list))
		unlock_exp_tids(uctxt, &uctxt->tid_used_list, fd);
	mutex_unlock(&uctxt->exp_mutex);

	kfree(fd->invalid_tids);
	fd->invalid_tids = NULL;

	kfree(fd->entry_to_rb);
	fd->entry_to_rb = NULL;
}
117 
118 /*
119  * Release pinned receive buffer pages.
120  *
121  * @mapped: true if the pages have been DMA mapped. false otherwise.
122  * @idx: Index of the first page to unpin.
123  * @npages: No of pages to unpin.
124  *
125  * If the pages have been DMA mapped (indicated by mapped parameter), their
126  * info will be passed via a struct tid_rb_node. If they haven't been mapped,
127  * their info will be passed via a struct tid_user_buf.
128  */
129 static void unpin_rcv_pages(struct hfi1_filedata *fd,
130 			    struct tid_user_buf *tidbuf,
131 			    struct tid_rb_node *node,
132 			    unsigned int idx,
133 			    unsigned int npages,
134 			    bool mapped)
135 {
136 	struct page **pages;
137 	struct hfi1_devdata *dd = fd->uctxt->dd;
138 	struct mm_struct *mm;
139 
140 	if (mapped) {
141 		dma_unmap_single(&dd->pcidev->dev, node->dma_addr,
142 				 node->npages * PAGE_SIZE, DMA_FROM_DEVICE);
143 		pages = &node->pages[idx];
144 		mm = mm_from_tid_node(node);
145 	} else {
146 		pages = &tidbuf->pages[idx];
147 		mm = current->mm;
148 	}
149 	hfi1_release_user_pages(mm, pages, npages, mapped);
150 	fd->tid_n_pinned -= npages;
151 }
152 
153 /*
154  * Pin receive buffer pages.
155  */
static int pin_rcv_pages(struct hfi1_filedata *fd, struct tid_user_buf *tidbuf)
{
	int pinned;
	unsigned int npages = tidbuf->npages;
	unsigned long vaddr = tidbuf->vaddr;
	struct page **pages = NULL;
	struct hfi1_devdata *dd = fd->uctxt->dd;

	/* A buffer can never need more pages than there are RcvArray entries. */
	if (npages > fd->uctxt->expected_count) {
		dd_dev_err(dd, "Expected buffer too big\n");
		return -EINVAL;
	}

	/* Allocate the array of struct page pointers needed for pinning */
	pages = kzalloc_objs(*pages, npages);
	if (!pages)
		return -ENOMEM;

	/*
	 * Pin all the pages of the user buffer. If we can't pin all the
	 * pages, accept the amount pinned so far and program only that.
	 * User space knows how to deal with partially programmed buffers.
	 */
	if (!hfi1_can_pin_pages(dd, current->mm, fd->tid_n_pinned, npages)) {
		kfree(pages);
		return -ENOMEM;
	}

	pinned = hfi1_acquire_user_pages(current->mm, vaddr, npages, true, pages);
	if (pinned <= 0) {
		kfree(pages);
		return pinned;
	}
	/* Success: tidbuf owns the pages array; caller frees via tidbuf. */
	tidbuf->pages = pages;
	fd->tid_n_pinned += pinned;
	/* Returns the number of pages actually pinned (may be < npages). */
	return pinned;
}
193 
194 /*
195  * RcvArray entry allocation for Expected Receives is done by the
196  * following algorithm:
197  *
198  * The context keeps 3 lists of groups of RcvArray entries:
199  *   1. List of empty groups - tid_group_list
200  *      This list is created during user context creation and
201  *      contains elements which describe sets (of 8) of empty
202  *      RcvArray entries.
203  *   2. List of partially used groups - tid_used_list
204  *      This list contains sets of RcvArray entries which are
205  *      not completely used up. Another mapping request could
206  *      use some of all of the remaining entries.
207  *   3. List of full groups - tid_full_list
208  *      This is the list where sets that are completely used
209  *      up go.
210  *
211  * An attempt to optimize the usage of RcvArray entries is
212  * made by finding all sets of physically contiguous pages in a
213  * user's buffer.
214  * These physically contiguous sets are further split into
215  * sizes supported by the receive engine of the HFI. The
216  * resulting sets of pages are stored in struct tid_pageset,
217  * which describes the sets as:
218  *    * .count - number of pages in this set
219  *    * .idx - starting index into struct page ** array
220  *                    of this set
221  *
222  * From this point on, the algorithm deals with the page sets
223  * described above. The number of pagesets is divided by the
224  * RcvArray group size to produce the number of full groups
225  * needed.
226  *
227  * Groups from the 3 lists are manipulated using the following
228  * rules:
229  *   1. For each set of 8 pagesets, a complete group from
230  *      tid_group_list is taken, programmed, and moved to
231  *      the tid_full_list list.
232  *   2. For all remaining pagesets:
233  *      2.1 If the tid_used_list is empty and the tid_group_list
234  *          is empty, stop processing pageset and return only
235  *          what has been programmed up to this point.
236  *      2.2 If the tid_used_list is empty and the tid_group_list
237  *          is not empty, move a group from tid_group_list to
238  *          tid_used_list.
239  *      2.3 For each group is tid_used_group, program as much as
240  *          can fit into the group. If the group becomes fully
241  *          used, move it to tid_full_list.
242  */
int hfi1_user_exp_rcv_setup(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0, need_group = 0, pinned;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	unsigned int ngroups, pageset_count,
		tididx = 0, mapped, mapped_pages = 0;
	u32 *tidlist = NULL;
	struct tid_user_buf *tidbuf;
	unsigned long mmu_seq = 0;

	/* The user buffer must be page aligned and non-empty. */
	if (!PAGE_ALIGNED(tinfo->vaddr))
		return -EINVAL;
	if (tinfo->length == 0)
		return -EINVAL;

	tidbuf = kzalloc(sizeof(*tidbuf), GFP_KERNEL);
	if (!tidbuf)
		return -ENOMEM;

	mutex_init(&tidbuf->cover_mutex);
	tidbuf->vaddr = tinfo->vaddr;
	tidbuf->length = tinfo->length;
	tidbuf->npages = num_user_pages(tidbuf->vaddr, tidbuf->length);
	tidbuf->psets = kcalloc(uctxt->expected_count, sizeof(*tidbuf->psets),
				GFP_KERNEL);
	if (!tidbuf->psets) {
		ret = -ENOMEM;
		goto fail_release_mem;
	}

	if (fd->use_mn) {
		/*
		 * Register a notifier covering the whole buffer so an unmap
		 * racing with this setup can be detected via the read
		 * begin/retry sequence below.
		 */
		ret = mmu_interval_notifier_insert(
			&tidbuf->notifier, current->mm,
			tidbuf->vaddr, tidbuf->npages * PAGE_SIZE,
			&tid_cover_ops);
		if (ret)
			goto fail_release_mem;
		mmu_seq = mmu_interval_read_begin(&tidbuf->notifier);
	}

	pinned = pin_rcv_pages(fd, tidbuf);
	if (pinned <= 0) {
		ret = (pinned < 0) ? pinned : -ENOSPC;
		goto fail_unpin;
	}

	/* Find sets of physically contiguous pages */
	tidbuf->n_psets = find_phys_blocks(tidbuf, pinned);

	/* Reserve the number of expected tids to be used. */
	spin_lock(&fd->tid_lock);
	if (fd->tid_used + tidbuf->n_psets > fd->tid_limit)
		pageset_count = fd->tid_limit - fd->tid_used;
	else
		pageset_count = tidbuf->n_psets;
	fd->tid_used += pageset_count;
	spin_unlock(&fd->tid_lock);

	if (!pageset_count) {
		ret = -ENOSPC;
		goto fail_unreserve;
	}

	ngroups = pageset_count / dd->rcv_entries.group_size;
	tidlist = kcalloc(pageset_count, sizeof(*tidlist), GFP_KERNEL);
	if (!tidlist) {
		ret = -ENOMEM;
		goto fail_unreserve;
	}

	tididx = 0;

	/*
	 * From this point on, we are going to be using shared (between master
	 * and subcontexts) context resources. We need to take the lock.
	 */
	mutex_lock(&uctxt->exp_mutex);
	/*
	 * The first step is to program the RcvArray entries which are complete
	 * groups.
	 */
	while (ngroups && uctxt->tid_group_list.count) {
		struct tid_group *grp =
			tid_group_pop(&uctxt->tid_group_list);

		ret = program_rcvarray(fd, tidbuf, grp,
				       dd->rcv_entries.group_size,
				       tidlist, &tididx, &mapped);
		/*
		 * If there was a failure to program the RcvArray
		 * entries for the entire group, reset the grp fields
		 * and add the grp back to the free group list.
		 */
		if (ret <= 0) {
			tid_group_add_tail(grp, &uctxt->tid_group_list);
			hfi1_cdbg(TID,
				  "Failed to program RcvArray group %d", ret);
			goto unlock;
		}

		tid_group_add_tail(grp, &uctxt->tid_full_list);
		ngroups--;
		mapped_pages += mapped;
	}

	/* Program the remaining pagesets into partially used groups. */
	while (tididx < pageset_count) {
		struct tid_group *grp, *ptr;
		/*
		 * If we don't have any partially used tid groups, check
		 * if we have empty groups. If so, take one from there and
		 * put in the partially used list.
		 */
		if (!uctxt->tid_used_list.count || need_group) {
			if (!uctxt->tid_group_list.count)
				goto unlock;

			grp = tid_group_pop(&uctxt->tid_group_list);
			tid_group_add_tail(grp, &uctxt->tid_used_list);
			need_group = 0;
		}
		/*
		 * There is an optimization opportunity here - instead of
		 * fitting as many page sets as we can, check for a group
		 * later on in the list that could fit all of them.
		 */
		list_for_each_entry_safe(grp, ptr, &uctxt->tid_used_list.list,
					 list) {
			unsigned use = min_t(unsigned, pageset_count - tididx,
					     grp->size - grp->used);

			ret = program_rcvarray(fd, tidbuf, grp,
					       use, tidlist,
					       &tididx, &mapped);
			if (ret < 0) {
				hfi1_cdbg(TID,
					  "Failed to program RcvArray entries %d",
					  ret);
				goto unlock;
			} else if (ret > 0) {
				if (grp->used == grp->size)
					tid_group_move(grp,
						       &uctxt->tid_used_list,
						       &uctxt->tid_full_list);
				mapped_pages += mapped;
				need_group = 0;
				/* Check if we are done so we break out early */
				if (tididx >= pageset_count)
					break;
			} else if (WARN_ON(ret == 0)) {
				/*
				 * If ret is 0, we did not program any entries
				 * into this group, which can only happen if
				 * we've screwed up the accounting somewhere.
				 * Warn and try to continue.
				 */
				need_group = 1;
			}
		}
	}
unlock:
	mutex_unlock(&uctxt->exp_mutex);
	hfi1_cdbg(TID, "total mapped: tidpairs:%u pages:%u (%d)", tididx,
		  mapped_pages, ret);

	/* fail if nothing was programmed, set error if none provided */
	if (tididx == 0) {
		if (ret >= 0)
			ret = -ENOSPC;
		goto fail_unreserve;
	}

	/* adjust reserved tid_used to actual count */
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count - tididx;
	spin_unlock(&fd->tid_lock);

	/* unpin all pages not covered by a TID */
	unpin_rcv_pages(fd, tidbuf, NULL, mapped_pages, pinned - mapped_pages,
			false);

	if (fd->use_mn) {
		/* check for an invalidate during setup */
		bool fail = false;

		/* cover_mutex pairs with tid_cover_invalidate(). */
		mutex_lock(&tidbuf->cover_mutex);
		fail = mmu_interval_read_retry(&tidbuf->notifier, mmu_seq);
		mutex_unlock(&tidbuf->cover_mutex);

		if (fail) {
			ret = -EBUSY;
			goto fail_unprogram;
		}
	}

	/* Report actual programmed count/length back to user space. */
	tinfo->tidcnt = tididx;
	tinfo->length = mapped_pages * PAGE_SIZE;

	if (copy_to_user(u64_to_user_ptr(tinfo->tidlist),
			 tidlist, sizeof(tidlist[0]) * tididx)) {
		ret = -EFAULT;
		goto fail_unprogram;
	}

	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return 0;

fail_unprogram:
	/* unprogram, unmap, and unpin all allocated TIDs */
	tinfo->tidlist = (unsigned long)tidlist;
	hfi1_user_exp_rcv_clear(fd, tinfo);
	tinfo->tidlist = 0;
	pinned = 0;		/* nothing left to unpin */
	pageset_count = 0;	/* nothing left reserved */
fail_unreserve:
	spin_lock(&fd->tid_lock);
	fd->tid_used -= pageset_count;
	spin_unlock(&fd->tid_lock);
fail_unpin:
	if (fd->use_mn)
		mmu_interval_notifier_remove(&tidbuf->notifier);
	if (pinned > 0)
		unpin_rcv_pages(fd, tidbuf, NULL, 0, pinned, false);
fail_release_mem:
	kfree(tidbuf->pages);
	kfree(tidbuf->psets);
	kfree(tidbuf);
	kfree(tidlist);
	return ret;
}
479 
/*
 * Unprogram the RcvArray entries named by the user-supplied TID list.
 * On partial failure, tinfo->tidcnt is rewritten to the number of TIDs
 * actually cleared and the first failing errno is returned.
 */
int hfi1_user_exp_rcv_clear(struct hfi1_filedata *fd,
			    struct hfi1_tid_info *tinfo)
{
	int ret = 0;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	u32 *tidinfo;
	unsigned tididx;

	/* Cannot clear more TIDs than this fd has in use. */
	if (unlikely(tinfo->tidcnt > fd->tid_used))
		return -EINVAL;

	tidinfo = memdup_array_user(u64_to_user_ptr(tinfo->tidlist),
				    tinfo->tidcnt, sizeof(tidinfo[0]));
	if (IS_ERR(tidinfo))
		return PTR_ERR(tidinfo);

	mutex_lock(&uctxt->exp_mutex);
	for (tididx = 0; tididx < tinfo->tidcnt; tididx++) {
		ret = unprogram_rcvarray(fd, tidinfo[tididx]);
		if (ret) {
			hfi1_cdbg(TID, "Failed to unprogram rcv array %d",
				  ret);
			break;
		}
	}
	/* Account only for the entries that were actually cleared. */
	spin_lock(&fd->tid_lock);
	fd->tid_used -= tididx;
	spin_unlock(&fd->tid_lock);
	tinfo->tidcnt = tididx;
	mutex_unlock(&uctxt->exp_mutex);

	kfree(tidinfo);
	return ret;
}
514 
/*
 * Hand the list of TIDs invalidated by the MMU notifier back to user
 * space and clear the pending-invalidation state for this fd.
 */
int hfi1_user_exp_rcv_invalid(struct hfi1_filedata *fd,
			      struct hfi1_tid_info *tinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	unsigned long *ev = uctxt->dd->events +
		(uctxt_offset(uctxt) + fd->subctxt);
	u32 *array;
	int ret = 0;

	/*
	 * copy_to_user() can sleep, which will leave the invalid_lock
	 * locked and cause the MMU notifier to be blocked on the lock
	 * for a long time.
	 * Copy the data to a local buffer so we can release the lock.
	 */
	array = kcalloc(uctxt->expected_count, sizeof(*array), GFP_KERNEL);
	if (!array)
		/*
		 * NOTE(review): -EFAULT for an allocation failure looks odd
		 * (-ENOMEM would be conventional) — confirm user space does
		 * not depend on this before changing it.
		 */
		return -EFAULT;

	spin_lock(&fd->invalid_lock);
	if (fd->invalid_tid_idx) {
		/* Snapshot and reset the pending invalid-TID queue. */
		memcpy(array, fd->invalid_tids, sizeof(*array) *
		       fd->invalid_tid_idx);
		memset(fd->invalid_tids, 0, sizeof(*fd->invalid_tids) *
		       fd->invalid_tid_idx);
		tinfo->tidcnt = fd->invalid_tid_idx;
		fd->invalid_tid_idx = 0;
		/*
		 * Reset the user flag while still holding the lock.
		 * Otherwise, PSM can miss events.
		 */
		clear_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
	} else {
		tinfo->tidcnt = 0;
	}
	spin_unlock(&fd->invalid_lock);

	if (tinfo->tidcnt) {
		if (copy_to_user((void __user *)tinfo->tidlist,
				 array, sizeof(*array) * tinfo->tidcnt))
			ret = -EFAULT;
	}
	kfree(array);

	return ret;
}
561 
/*
 * Split the pinned user buffer into HW-programmable page sets.
 * Fills tidbuf->psets with {idx, count} pairs and returns how many
 * sets were produced.
 */
static u32 find_phys_blocks(struct tid_user_buf *tidbuf, unsigned int npages)
{
	unsigned pagecount, pageidx, setcount = 0, i;
	unsigned long pfn, this_pfn;
	struct page **pages = tidbuf->pages;
	struct tid_pageset *list = tidbuf->psets;

	if (!npages)
		return 0;

	/*
	 * Look for sets of physically contiguous pages in the user buffer.
	 * This will allow us to optimize Expected RcvArray entry usage by
	 * using the bigger supported sizes.
	 */
	pfn = page_to_pfn(pages[0]);
	/*
	 * The loop runs one past the last page (i == npages) with a
	 * this_pfn of 0 so the final contiguous run is always flushed.
	 */
	for (pageidx = 0, pagecount = 1, i = 1; i <= npages; i++) {
		this_pfn = i < npages ? page_to_pfn(pages[i]) : 0;

		/*
		 * If the pfn's are not sequential, pages are not physically
		 * contiguous.
		 */
		if (this_pfn != ++pfn) {
			/*
			 * At this point we have to loop over the set of
			 * physically contiguous pages and break them down it
			 * sizes supported by the HW.
			 * There are two main constraints:
			 *     1. The max buffer size is MAX_EXPECTED_BUFFER.
			 *        If the total set size is bigger than that
			 *        program only a MAX_EXPECTED_BUFFER chunk.
			 *     2. The buffer size has to be a power of two. If
			 *        it is not, round down to the closes power of
			 *        2 and program that size.
			 */
			while (pagecount) {
				int maxpages = pagecount;
				u32 bufsize = pagecount * PAGE_SIZE;

				if (bufsize > MAX_EXPECTED_BUFFER)
					maxpages =
						MAX_EXPECTED_BUFFER >>
						PAGE_SHIFT;
				else if (!is_power_of_2(bufsize))
					maxpages =
						rounddown_pow_of_two(bufsize) >>
						PAGE_SHIFT;

				list[setcount].idx = pageidx;
				list[setcount].count = maxpages;
				pagecount -= maxpages;
				pageidx += maxpages;
				setcount++;
			}
			/* Start a new run at the current page. */
			pageidx = i;
			pagecount = 1;
			pfn = this_pfn;
		} else {
			pagecount++;
		}
	}
	return setcount;
}
626 
627 /**
628  * program_rcvarray() - program an RcvArray group with receive buffers
629  * @fd: filedata pointer
630  * @tbuf: pointer to struct tid_user_buf that has the user buffer starting
631  *	  virtual address, buffer length, page pointers, pagesets (array of
632  *	  struct tid_pageset holding information on physically contiguous
633  *	  chunks from the user buffer), and other fields.
634  * @grp: RcvArray group
635  * @count: number of struct tid_pageset's to program
636  * @tidlist: the array of u32 elements when the information about the
637  *           programmed RcvArray entries is to be encoded.
638  * @tididx: starting offset into tidlist
639  * @pmapped: (output parameter) number of pages programmed into the RcvArray
640  *           entries.
641  *
642  * This function will program up to 'count' number of RcvArray entries from the
643  * group 'grp'. To make best use of write-combining writes, the function will
644  * perform writes to the unused RcvArray entries which will be ignored by the
645  * HW. Each RcvArray entry will be programmed with a physically contiguous
646  * buffer chunk from the user's virtual buffer.
647  *
648  * Return:
649  * -EINVAL if the requested count is larger than the size of the group,
650  * -ENOMEM or -EFAULT on error from set_rcvarray_entry(), or
651  * number of RcvArray entries programmed.
652  */
static int program_rcvarray(struct hfi1_filedata *fd, struct tid_user_buf *tbuf,
			    struct tid_group *grp, u16 count,
			    u32 *tidlist, unsigned int *tididx,
			    unsigned int *pmapped)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	u16 idx;
	unsigned int start = *tididx;
	u32 tidinfo = 0, rcventry, useidx = 0;
	int mapped = 0;

	/* Count should never be larger than the group size */
	if (count > grp->size)
		return -EINVAL;

	/*
	 * Find the first unused entry in the group, issuing write-combining
	 * filler writes to the used entries we skip over.
	 */
	for (idx = 0; idx < grp->size; idx++) {
		if (!(grp->map & (1 << idx))) {
			useidx = idx;
			break;
		}
		rcv_array_wc_fill(dd, grp->base + idx);
	}

	idx = 0;
	while (idx < count) {
		u16 npages, pageidx, setidx = start + idx;
		int ret = 0;

		/*
		 * If this entry in the group is used, move to the next one.
		 * If we go past the end of the group, exit the loop.
		 */
		if (useidx >= grp->size) {
			break;
		} else if (grp->map & (1 << useidx)) {
			rcv_array_wc_fill(dd, grp->base + useidx);
			useidx++;
			continue;
		}

		rcventry = grp->base + useidx;
		npages = tbuf->psets[setidx].count;
		pageidx = tbuf->psets[setidx].idx;

		ret = set_rcvarray_entry(fd, tbuf,
					 rcventry, grp, pageidx,
					 npages);
		if (ret)
			return ret;
		mapped += npages;

		/* Encode this entry for the user-visible TID list. */
		tidinfo = create_tid(rcventry - uctxt->expected_base, npages);
		tidlist[(*tididx)++] = tidinfo;
		grp->used++;
		grp->map |= 1 << useidx++;
		idx++;
	}

	/* Fill the rest of the group with "blank" writes */
	for (; useidx < grp->size; useidx++)
		rcv_array_wc_fill(dd, grp->base + useidx);
	*pmapped = mapped;
	/* Number of RcvArray entries programmed. */
	return idx;
}
719 
/*
 * DMA map one physically contiguous page set and program it into the
 * RcvArray entry 'rcventry', tracking it with a freshly allocated
 * tid_rb_node (registered with the MMU notifier when caching is on).
 */
static int set_rcvarray_entry(struct hfi1_filedata *fd,
			      struct tid_user_buf *tbuf,
			      u32 rcventry, struct tid_group *grp,
			      u16 pageidx, unsigned int npages)
{
	int ret;
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct tid_rb_node *node;
	struct hfi1_devdata *dd = uctxt->dd;
	dma_addr_t phys;
	struct page **pages = tbuf->pages + pageidx;

	/*
	 * Allocate the node first so we can handle a potential
	 * failure before we've programmed anything.
	 */
	node = kzalloc_flex(*node, pages, npages);
	if (!node)
		return -ENOMEM;

	phys = dma_map_single(&dd->pcidev->dev, __va(page_to_phys(pages[0])),
			      npages * PAGE_SIZE, DMA_FROM_DEVICE);
	if (dma_mapping_error(&dd->pcidev->dev, phys)) {
		dd_dev_err(dd, "Failed to DMA map Exp Rcv pages 0x%llx\n",
			   phys);
		kfree(node);
		return -EFAULT;
	}

	node->fdata = fd;
	mutex_init(&node->invalidate_mutex);
	node->phys = page_to_phys(pages[0]);
	node->npages = npages;
	node->rcventry = rcventry;
	node->dma_addr = phys;
	node->grp = grp;
	node->freed = false;
	memcpy(node->pages, pages, flex_array_size(node, pages, npages));

	if (fd->use_mn) {
		/* Watch this page range for invalidation by the MMU. */
		ret = mmu_interval_notifier_insert(
			&node->notifier, current->mm,
			tbuf->vaddr + (pageidx * PAGE_SIZE), npages * PAGE_SIZE,
			&tid_mn_ops);
		if (ret)
			goto out_unmap;
	}
	fd->entry_to_rb[node->rcventry - uctxt->expected_base] = node;

	/* Program the hardware only after all tracking is in place. */
	hfi1_put_tid(dd, rcventry, PT_EXPECTED, phys, ilog2(npages) + 1);
	trace_hfi1_exp_tid_reg(uctxt->ctxt, fd->subctxt, rcventry, npages,
			       node->notifier.interval_tree.start, node->phys,
			       phys);
	return 0;

out_unmap:
	hfi1_cdbg(TID, "Failed to insert RB node %u 0x%lx, 0x%lx %d",
		  node->rcventry, node->notifier.interval_tree.start,
		  node->phys, ret);
	dma_unmap_single(&dd->pcidev->dev, phys, npages * PAGE_SIZE,
			 DMA_FROM_DEVICE);
	kfree(node);
	return -EFAULT;
}
784 
/*
 * Decode a user-visible TID value and release the RcvArray entry it
 * names. tidctrl selects which entry of the pair (1 or 2); 0 and 3
 * are invalid encodings.
 */
static int unprogram_rcvarray(struct hfi1_filedata *fd, u32 tidinfo)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;
	struct tid_rb_node *node;
	u32 tidctrl = EXP_TID_GET(tidinfo, CTRL);
	u32 tididx = EXP_TID_GET(tidinfo, IDX) << 1, rcventry;

	if (tidctrl == 0x3 || tidctrl == 0x0)
		return -EINVAL;

	rcventry = tididx + (tidctrl - 1);

	if (rcventry >= uctxt->expected_count) {
		dd_dev_err(dd, "Invalid RcvArray entry (%u) index for ctxt %u\n",
			   rcventry, uctxt->ctxt);
		return -EINVAL;
	}

	/* The node must exist and match the decoded entry. */
	node = fd->entry_to_rb[rcventry];
	if (!node || node->rcventry != (uctxt->expected_base + rcventry))
		return -EBADF;

	if (fd->use_mn)
		mmu_interval_notifier_remove(&node->notifier);
	cacheless_tid_rb_remove(fd, node);

	return 0;
}
814 
/*
 * Invalidate the hardware RcvArray entry for 'node' and unpin its pages.
 * Idempotent: the 'freed' flag under invalidate_mutex makes a second
 * call (e.g. notifier vs. explicit clear racing) a no-op.
 */
static void __clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;
	struct hfi1_devdata *dd = uctxt->dd;

	mutex_lock(&node->invalidate_mutex);
	if (node->freed)
		goto done;
	node->freed = true;

	trace_hfi1_exp_tid_unreg(uctxt->ctxt, fd->subctxt, node->rcventry,
				 node->npages,
				 node->notifier.interval_tree.start, node->phys,
				 node->dma_addr);

	/* Make sure device has seen the write before pages are unpinned */
	hfi1_put_tid(dd, node->rcventry, PT_INVALID_FLUSH, 0, 0);

	unpin_rcv_pages(fd, NULL, node, 0, node->npages, true);
done:
	mutex_unlock(&node->invalidate_mutex);
}
837 
/*
 * Fully release a TID node: clear the hardware entry, update the group's
 * usage accounting, migrate the group between the full/used/empty lists
 * as needed, and free the node.
 */
static void clear_tid_node(struct hfi1_filedata *fd, struct tid_rb_node *node)
{
	struct hfi1_ctxtdata *uctxt = fd->uctxt;

	__clear_tid_node(fd, node);

	node->grp->used--;
	node->grp->map &= ~(1 << (node->rcventry - node->grp->base));

	/* Dropping from full to size-1 moves the group back to 'used'. */
	if (node->grp->used == node->grp->size - 1)
		tid_group_move(node->grp, &uctxt->tid_full_list,
			       &uctxt->tid_used_list);
	else if (!node->grp->used)
		tid_group_move(node->grp, &uctxt->tid_used_list,
			       &uctxt->tid_group_list);
	kfree(node);
}
855 
856 /*
857  * As a simple helper for hfi1_user_exp_rcv_free, this function deals with
858  * clearing nodes in the non-cached case.
859  */
static void unlock_exp_tids(struct hfi1_ctxtdata *uctxt,
			    struct exp_tid_set *set,
			    struct hfi1_filedata *fd)
{
	struct tid_group *grp, *ptr;
	int i;

	list_for_each_entry_safe(grp, ptr, &set->list, list) {
		list_del_init(&grp->list);

		/* Walk every used entry in the group's bitmap. */
		for (i = 0; i < grp->size; i++) {
			if (grp->map & (1 << i)) {
				u16 rcventry = grp->base + i;
				struct tid_rb_node *node;

				node = fd->entry_to_rb[rcventry -
							  uctxt->expected_base];
				/* Skip entries not owned by this fd. */
				if (!node || node->rcventry != rcventry)
					continue;

				if (fd->use_mn)
					mmu_interval_notifier_remove(
						&node->notifier);
				cacheless_tid_rb_remove(fd, node);
			}
		}
	}
}
888 
/*
 * MMU notifier callback for a single programmed TID. On unmap, clear the
 * hardware entry and queue the TID for user space to pick up via
 * hfi1_user_exp_rcv_invalid().
 */
static bool tid_rb_invalidate(struct mmu_interval_notifier *mni,
			      const struct mmu_notifier_range *range,
			      unsigned long cur_seq)
{
	struct tid_rb_node *node =
		container_of(mni, struct tid_rb_node, notifier);
	struct hfi1_filedata *fdata = node->fdata;
	struct hfi1_ctxtdata *uctxt = fdata->uctxt;

	/* Already torn down elsewhere; nothing to do. */
	if (node->freed)
		return true;

	/* take action only if unmapping */
	if (range->event != MMU_NOTIFY_UNMAP)
		return true;

	trace_hfi1_exp_tid_inval(uctxt->ctxt, fdata->subctxt,
				 node->notifier.interval_tree.start,
				 node->rcventry, node->npages, node->dma_addr);

	/* clear the hardware rcvarray entry */
	__clear_tid_node(fdata, node);

	spin_lock(&fdata->invalid_lock);
	if (fdata->invalid_tid_idx < uctxt->expected_count) {
		fdata->invalid_tids[fdata->invalid_tid_idx] =
			create_tid(node->rcventry - uctxt->expected_base,
				   node->npages);
		if (!fdata->invalid_tid_idx) {
			unsigned long *ev;

			/*
			 * hfi1_set_uevent_bits() sets a user event flag
			 * for all processes. Because calling into the
			 * driver to process TID cache invalidations is
			 * expensive and TID cache invalidations are
			 * handled on a per-process basis, we can
			 * optimize this to set the flag only for the
			 * process in question.
			 */
			ev = uctxt->dd->events +
				(uctxt_offset(uctxt) + fdata->subctxt);
			set_bit(_HFI1_EVENT_TID_MMU_NOTIFY_BIT, ev);
		}
		fdata->invalid_tid_idx++;
	}
	spin_unlock(&fdata->invalid_lock);
	return true;
}
938 
939 static bool tid_cover_invalidate(struct mmu_interval_notifier *mni,
940 			         const struct mmu_notifier_range *range,
941 			         unsigned long cur_seq)
942 {
943 	struct tid_user_buf *tidbuf =
944 		container_of(mni, struct tid_user_buf, notifier);
945 
946 	/* take action only if unmapping */
947 	if (range->event == MMU_NOTIFY_UNMAP) {
948 		mutex_lock(&tidbuf->cover_mutex);
949 		mmu_interval_set_seq(mni, cur_seq);
950 		mutex_unlock(&tidbuf->cover_mutex);
951 	}
952 
953 	return true;
954 }
955 
956 static void cacheless_tid_rb_remove(struct hfi1_filedata *fdata,
957 				    struct tid_rb_node *tnode)
958 {
959 	u32 base = fdata->uctxt->expected_base;
960 
961 	fdata->entry_to_rb[tnode->rcventry - base] = NULL;
962 	clear_tid_node(fdata, tnode);
963 }
964