xref: /freebsd/sys/dev/mlx4/mlx4_ib/mlx4_ib_mr.c (revision d01498defbe804f66435b44f22da9278acddf082)
1 /*
2  * Copyright (c) 2007 Cisco Systems, Inc. All rights reserved.
3  * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
4  *
5  * This software is available to you under a choice of one of two
6  * licenses.  You may choose to be licensed under the terms of the GNU
7  * General Public License (GPL) Version 2, available from the file
8  * COPYING in the main directory of this source tree, or the
9  * OpenIB.org BSD license below:
10  *
11  *     Redistribution and use in source and binary forms, with or
12  *     without modification, are permitted provided that the following
13  *     conditions are met:
14  *
15  *      - Redistributions of source code must retain the above
16  *        copyright notice, this list of conditions and the following
17  *        disclaimer.
18  *
19  *      - Redistributions in binary form must reproduce the above
20  *        copyright notice, this list of conditions and the following
21  *        disclaimer in the documentation and/or other materials
22  *        provided with the distribution.
23  *
24  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
25  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
26  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
27  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
28  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
29  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
30  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
31  * SOFTWARE.
32  */
33 
34 #include <linux/slab.h>
35 #include <linux/module.h>
36 #include <linux/sched.h>
37 
38 #include "mlx4_ib.h"
39 
40 static u32 convert_access(int acc)
41 {
42 	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX4_PERM_ATOMIC       : 0) |
43 	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX4_PERM_REMOTE_WRITE : 0) |
44 	       (acc & IB_ACCESS_REMOTE_READ   ? MLX4_PERM_REMOTE_READ  : 0) |
45 	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX4_PERM_LOCAL_WRITE  : 0) |
46 	       (acc & IB_ACCESS_MW_BIND       ? MLX4_PERM_BIND_MW      : 0) |
47 	       MLX4_PERM_LOCAL_READ;
48 }
/* No support for the Shared MR feature */
50 #if 0
51 static ssize_t shared_mr_proc_read(struct file *file,
52 			  char __user *buffer,
53 			  size_t len,
54 			  loff_t *offset)
55 {
56 
57 	return -ENOSYS;
58 
59 }
60 
61 static ssize_t shared_mr_proc_write(struct file *file,
62 			   const char __user *buffer,
63 			   size_t len,
64 			   loff_t *offset)
65 {
66 
67 	return -ENOSYS;
68 }
69 
70 static int shared_mr_mmap(struct file *filep, struct vm_area_struct *vma)
71 {
72 
73 	struct proc_dir_entry *pde = PDE(filep->f_path.dentry->d_inode);
74 	struct mlx4_shared_mr_info *smr_info =
75 		(struct mlx4_shared_mr_info *)pde->data;
76 
77 	/* Prevent any mapping not on start of area */
78 	if (vma->vm_pgoff != 0)
79 		return -EINVAL;
80 
81 	return ib_umem_map_to_vma(smr_info->umem,
82 					vma);
83 
84 }
85 
86 static const struct file_operations shared_mr_proc_ops = {
87 	.owner	= THIS_MODULE,
88 	.read	= shared_mr_proc_read,
89 	.write	= shared_mr_proc_write,
90 	.mmap	= shared_mr_mmap
91 };
92 
93 static mode_t convert_shared_access(int acc)
94 {
95 
96 	return (acc & IB_ACCESS_SHARED_MR_USER_READ ? S_IRUSR       : 0) |
97 	       (acc & IB_ACCESS_SHARED_MR_USER_WRITE  ? S_IWUSR : 0) |
98 	       (acc & IB_ACCESS_SHARED_MR_GROUP_READ   ? S_IRGRP  : 0) |
99 	       (acc & IB_ACCESS_SHARED_MR_GROUP_WRITE   ? S_IWGRP  : 0) |
100 	       (acc & IB_ACCESS_SHARED_MR_OTHER_READ   ? S_IROTH  : 0) |
101 	       (acc & IB_ACCESS_SHARED_MR_OTHER_WRITE   ? S_IWOTH  : 0);
102 
103 }
104 #endif
105 struct ib_mr *mlx4_ib_get_dma_mr(struct ib_pd *pd, int acc)
106 {
107 	struct mlx4_ib_mr *mr;
108 	int err;
109 
110 	mr = kzalloc(sizeof *mr, GFP_KERNEL);
111 	if (!mr)
112 		return ERR_PTR(-ENOMEM);
113 
114 	err = mlx4_mr_alloc(to_mdev(pd->device)->dev, to_mpd(pd)->pdn, 0,
115 			    ~0ull, convert_access(acc), 0, 0, &mr->mmr);
116 	if (err)
117 		goto err_free;
118 
119 	err = mlx4_mr_enable(to_mdev(pd->device)->dev, &mr->mmr);
120 	if (err)
121 		goto err_mr;
122 
123 	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
124 	mr->umem = NULL;
125 
126 	return &mr->ibmr;
127 
128 err_mr:
129 	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
130 
131 err_free:
132 	kfree(mr);
133 
134 	return ERR_PTR(err);
135 }
136 
137 static int mlx4_ib_umem_write_mtt_block(struct mlx4_ib_dev *dev,
138 						struct mlx4_mtt *mtt,
139 						u64 mtt_size,
140 						u64 mtt_shift,
141 						u64 len,
142 						u64 cur_start_addr,
143 						u64 *pages,
144 						int *start_index,
145 						int *npages)
146 {
147 	int k;
148 	int err = 0;
149 	u64 mtt_entries;
150 	u64 cur_end_addr = cur_start_addr + len;
151 	u64 cur_end_addr_aligned = 0;
152 
153 	len += (cur_start_addr & (mtt_size-1ULL));
154 	cur_end_addr_aligned = round_up(cur_end_addr, mtt_size);
155 	len += (cur_end_addr_aligned - cur_end_addr);
156 	if (len & (mtt_size-1ULL)) {
157 		WARN(1 ,
158 		"write_block: len %llx is not aligned to mtt_size %llx\n",
159 			(unsigned long long)len, (unsigned long long)mtt_size);
160 		return -EINVAL;
161 	}
162 
163 
164 	mtt_entries = (len >> mtt_shift);
165 
166 	/* Align the MTT start address to
167 		the mtt_size.
168 		Required to handle cases when the MR
169 		starts in the middle of an MTT record.
170 		Was not required in old code since
171 		the physical addresses provided by
172 		the dma subsystem were page aligned,
173 		which was also the MTT size.
174 	*/
175 	cur_start_addr = round_down(cur_start_addr, mtt_size);
176 	/* A new block is started ...*/
177 	for (k = 0; k < mtt_entries; ++k) {
178 		pages[*npages] = cur_start_addr + (mtt_size * k);
179 		(*npages)++;
180 		/*
181 		 * Be friendly to mlx4_write_mtt() and
182 		 * pass it chunks of appropriate size.
183 		 */
184 		if (*npages == PAGE_SIZE / sizeof(u64)) {
185 			err = mlx4_write_mtt(dev->dev,
186 					mtt, *start_index,
187 					*npages, pages);
188 			if (err)
189 				return err;
190 
191 			(*start_index) += *npages;
192 			*npages = 0;
193 		}
194 	}
195 
196 	return 0;
197 }
198 
199 int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
200 			   struct ib_umem *umem)
201 {
202 	u64 *pages;
203 	u64 len = 0;
204 	int err = 0;
205 	u64 mtt_size;
206 	u64 cur_start_addr = 0;
207 	u64 mtt_shift;
208 	int start_index = 0;
209 	int npages = 0;
210 	struct scatterlist *sg;
211 	int i;
212 
213 	pages = (u64 *) __get_free_page(GFP_KERNEL);
214 	if (!pages)
215 		return -ENOMEM;
216 
217 	mtt_shift = mtt->page_shift;
218 	mtt_size = 1ULL << mtt_shift;
219 
220 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
221 			if (cur_start_addr + len ==
222 			    sg_dma_address(sg)) {
223 				/* still the same block */
224 				len += sg_dma_len(sg);
225 				continue;
226 			}
227 			/* A new block is started ...*/
228 			/* If len is malaligned, write an extra mtt entry to
229 			    cover the misaligned area (round up the division)
230 			*/
231 			err = mlx4_ib_umem_write_mtt_block(dev,
232 						mtt, mtt_size, mtt_shift,
233 						len, cur_start_addr,
234 						pages,
235 						&start_index,
236 						&npages);
237 			if (err)
238 				goto out;
239 
240 			cur_start_addr =
241 				sg_dma_address(sg);
242 			len = sg_dma_len(sg);
243 	}
244 
245 	/* Handle the last block */
246 	if (len > 0) {
247 		/*  If len is malaligned, write an extra mtt entry to cover
248 		     the misaligned area (round up the division)
249 		*/
250 		err = mlx4_ib_umem_write_mtt_block(dev,
251 						mtt, mtt_size, mtt_shift,
252 						len, cur_start_addr,
253 						pages,
254 						&start_index,
255 						&npages);
256 			if (err)
257 				goto out;
258 	}
259 
260 
261 	if (npages)
262 		err = mlx4_write_mtt(dev->dev, mtt, start_index, npages, pages);
263 
264 out:
265 	free_page((unsigned long) pages);
266 	return err;
267 }
268 
269 static inline u64 alignment_of(u64 ptr)
270 {
271 	return ilog2(ptr & (~(ptr-1)));
272 }
273 
274 static int mlx4_ib_umem_calc_block_mtt(u64 next_block_start,
275 						u64 current_block_end,
276 						u64 block_shift)
277 {
278 	/* Check whether the alignment of the new block
279 	     is aligned as well as the previous block.
280 	     Block address must start with zeros till size of entity_size.
281 	*/
282 	if ((next_block_start & ((1ULL << block_shift) - 1ULL)) != 0)
283 		/* It is not as well aligned as the
284 		previous block-reduce the mtt size
285 		accordingly.
286 		Here we take the last right bit
287 		which is 1.
288 		*/
289 		block_shift = alignment_of(next_block_start);
290 
291 	/*  Check whether the alignment of the
292 	     end of previous block - is it aligned
293 	     as well as the start of the block
294 	*/
295 	if (((current_block_end) & ((1ULL << block_shift) - 1ULL)) != 0)
296 		/* It is not as well aligned as
297 		the start of the block - reduce the
298 		mtt size accordingly.
299 		*/
300 		block_shift = alignment_of(current_block_end);
301 
302 	return block_shift;
303 }
304 
305 /* Calculate optimal mtt size based on contiguous pages.
306 * Function will return also the number of pages that are not aligned to the
307    calculated mtt_size to be added to total number
308     of pages. For that we should check the first chunk length & last chunk
309     length and if not aligned to mtt_size we should increment
310     the non_aligned_pages number.
311     All chunks in the middle already handled as part of mtt shift calculation
312     for both their start & end addresses.
313 */
314 int mlx4_ib_umem_calc_optimal_mtt_size(struct ib_umem *umem,
315 						u64 start_va,
316 						int *num_of_mtts)
317 {
318 	u64 block_shift = MLX4_MAX_MTT_SHIFT;
319 	u64 current_block_len = 0;
320 	u64 current_block_start = 0;
321 	u64 misalignment_bits;
322 	u64 first_block_start = 0;
323 	u64 last_block_end = 0;
324 	u64 total_len = 0;
325 	u64 last_block_aligned_end = 0;
326 	u64 min_shift = ilog2(umem->page_size);
327 	struct scatterlist *sg;
328 	int i;
329 	u64 next_block_start;
330 	u64 current_block_end;
331 
332 	for_each_sg(umem->sg_head.sgl, sg, umem->nmap, i) {
333 		/* Initialization - save the first chunk start as
334 		    the current_block_start - block means contiguous pages.
335 		*/
336 		if (current_block_len == 0 && current_block_start == 0) {
337 			first_block_start = current_block_start =
338 				sg_dma_address(sg);
339 			/* Find the bits that are different between
340 			    the physical address and the virtual
341 			    address for the start of the MR.
342 			*/
343 			/* umem_get aligned the start_va to a page
344 			   boundary. Therefore, we need to align the
345 			   start va to the same boundary */
346 			/* misalignment_bits is needed to handle the
347 			   case of a single memory region. In this
348 			   case, the rest of the logic will not reduce
349 			   the block size.  If we use a block size
350 			   which is bigger than the alignment of the
351 			   misalignment bits, we might use the virtual
352 			   page number instead of the physical page
353 			   number, resulting in access to the wrong
354 			   data. */
355 			misalignment_bits =
356 			(start_va & (~(((u64)(umem->page_size))-1ULL)))
357 						^ current_block_start;
358 			block_shift = min(alignment_of(misalignment_bits)
359 				, block_shift);
360 		}
361 
362 		/* Go over the scatter entries and check
363 		     if they continue the previous scatter entry.
364 		*/
365 		next_block_start =
366 			sg_dma_address(sg);
367 		current_block_end = current_block_start
368 			+ current_block_len;
369 		/* If we have a split (non-contig.) between two block*/
370 		if (current_block_end != next_block_start) {
371 			block_shift = mlx4_ib_umem_calc_block_mtt(
372 					next_block_start,
373 					current_block_end,
374 					block_shift);
375 
376 			/* If we reached the minimum shift for 4k
377 			     page we stop the loop.
378 			*/
379 			if (block_shift <= min_shift)
380 				goto end;
381 
382 			/* If not saved yet we are in first block -
383 			     we save the length of first block to
384 			     calculate the non_aligned_pages number at
385 			*    the end.
386 			*/
387 			total_len += current_block_len;
388 
389 			/* Start a new block */
390 			current_block_start = next_block_start;
391 			current_block_len =
392 				sg_dma_len(sg);
393 			continue;
394 		}
395 		/* The scatter entry is another part of
396 		     the current block, increase the block size
397 		* An entry in the scatter can be larger than
398 		4k (page) as of dma mapping
399 		which merge some blocks together.
400 		*/
401 		current_block_len +=
402 			sg_dma_len(sg);
403 	}
404 
405 	/* Account for the last block in the total len */
406 	total_len += current_block_len;
407 	/* Add to the first block the misalignment that it suffers from.*/
408 	total_len += (first_block_start & ((1ULL<<block_shift)-1ULL));
409 	last_block_end = current_block_start+current_block_len;
410 	last_block_aligned_end = round_up(last_block_end, 1<<block_shift);
411 	total_len += (last_block_aligned_end - last_block_end);
412 
413 	WARN((total_len & ((1ULL<<block_shift)-1ULL)),
414 		" misaligned total length detected (%llu, %llu)!",
415 		(unsigned long long)total_len, (unsigned long long)block_shift);
416 
417 	*num_of_mtts = total_len >> block_shift;
418 end:
419 	if (block_shift < min_shift) {
420 		/* If shift is less than the min we set a WARN and
421 		     return the min shift.
422 		*/
423 		WARN(1,
424 		"mlx4_ib_umem_calc_optimal_mtt_size - unexpected shift %lld\n",
425 		(unsigned long long)block_shift);
426 
427 		block_shift = min_shift;
428 	}
429 	return block_shift;
430 
431 }
432 
/* No support for Shared MR */
434 #if 0
435 static int prepare_shared_mr(struct mlx4_ib_mr *mr, int access_flags, int mr_id)
436 {
437 
438 	struct proc_dir_entry *mr_proc_entry;
439 	mode_t mode = S_IFREG;
440 	char name_buff[16];
441 
442 	mode |= convert_shared_access(access_flags);
443 	sprintf(name_buff, "%X", mr_id);
444 	mr->smr_info = kmalloc(sizeof(struct mlx4_shared_mr_info), GFP_KERNEL);
445 	mr->smr_info->mr_id = mr_id;
446 	mr->smr_info->umem = mr->umem;
447 
448 	mr_proc_entry = proc_create_data(name_buff, mode,
449 				mlx4_mrs_dir_entry,
450 				&shared_mr_proc_ops,
451 				mr->smr_info);
452 
453 	if (!mr_proc_entry) {
454 		pr_err("prepare_shared_mr failed via proc\n");
455 		kfree(mr->smr_info);
456 		return -ENODEV;
457 	}
458 
459 	current_uid_gid(&(mr_proc_entry->uid), &(mr_proc_entry->gid));
460 	mr_proc_entry->size = mr->umem->length;
461 	return 0;
462 
463 }
464 static int is_shared_mr(int access_flags)
465 {
466 	/* We should check whether IB_ACCESS_SHARED_MR_USER_READ or
467 	other shared bits were turned on.
468 	*/
469 	return !!(access_flags & (IB_ACCESS_SHARED_MR_USER_READ |
470 				IB_ACCESS_SHARED_MR_USER_WRITE |
471 				IB_ACCESS_SHARED_MR_GROUP_READ |
472 				IB_ACCESS_SHARED_MR_GROUP_WRITE |
473 				IB_ACCESS_SHARED_MR_OTHER_READ |
474 				IB_ACCESS_SHARED_MR_OTHER_WRITE));
475 
476 }
477 
478 static void free_smr_info(struct mlx4_ib_mr *mr)
479 {
480 	/* When master/parent shared mr is dereged there is
481 	no ability to share this mr any more - its mr_id will be
482 	returned to the kernel as part of ib_uverbs_dereg_mr
483 	and may be allocated again as part of other reg_mr.
484 	*/
485 	char name_buff[16];
486 
487 	sprintf(name_buff, "%X", mr->smr_info->mr_id);
488 	/* Remove proc entry is checking internally that no operation
489 	was strated on that proc fs file and if in the middle
490 	current process will wait till end of operation.
491 	That's why no sync mechanism is needed when we release
492 	below the shared umem.
493 	*/
494 	remove_proc_entry(name_buff, mlx4_mrs_dir_entry);
495 	kfree(mr->smr_info);
496 	mr->smr_info = NULL;
497 }
498 #endif
499 
500 static void mlx4_invalidate_umem(void *invalidation_cookie,
501 				struct ib_umem *umem,
502 				unsigned long addr, size_t size)
503 {
504 	struct mlx4_ib_mr *mr = (struct mlx4_ib_mr *)invalidation_cookie;
505 
506 	/* This function is called under client peer lock so its resources are race protected */
507 	if (atomic_inc_return(&mr->invalidated) > 1) {
508 		umem->invalidation_ctx->inflight_invalidation = 1;
509 		goto end;
510 	}
511 
512 	umem->invalidation_ctx->peer_callback = 1;
513 	mlx4_mr_free(to_mdev(mr->ibmr.device)->dev, &mr->mmr);
514 	ib_umem_release(umem);
515 	complete(&mr->invalidation_comp);
516 
517 end:
518 	return;
519 
520 }
521 
522 struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
523 				  u64 virt_addr, int access_flags,
524 				  struct ib_udata *udata,
525 				  int mr_id)
526 {
527 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
528 	struct mlx4_ib_mr *mr;
529 	int shift;
530 	int err;
531 	int n;
532 	struct ib_peer_memory_client *ib_peer_mem;
533 
534 	mr = kzalloc(sizeof *mr, GFP_KERNEL);
535 	if (!mr)
536 		return ERR_PTR(-ENOMEM);
537 
538 	mr->umem = ib_umem_get_ex(pd->uobject->context, start, length,
539 			access_flags, 0, 1);
540 	if (IS_ERR(mr->umem)) {
541 		err = PTR_ERR(mr->umem);
542 		goto err_free;
543 	}
544 
545 	ib_peer_mem = mr->umem->ib_peer_mem;
546 	n = ib_umem_page_count(mr->umem);
547 	shift = mlx4_ib_umem_calc_optimal_mtt_size(mr->umem, start,
548 		&n);
549 	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, virt_addr, length,
550 			 convert_access(access_flags), n, shift, &mr->mmr);
551 	if (err)
552 		goto err_umem;
553 
554 	err = mlx4_ib_umem_write_mtt(dev, &mr->mmr.mtt, mr->umem);
555 	if (err)
556 		goto err_mr;
557 
558 	err = mlx4_mr_enable(dev->dev, &mr->mmr);
559 	if (err)
560 		goto err_mr;
561 
562 	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
563 /* No suuport for Shared MR */
564 #if 0
565 	/* Check whether MR should be shared */
566 	if (is_shared_mr(access_flags)) {
567 	/* start address and length must be aligned to page size in order
568 	    to map a full page and preventing leakage of data */
569 		if (mr->umem->offset || (length & ~PAGE_MASK)) {
570 		        err = -EINVAL;
571 		        goto err_mr;
572 		}
573 
574 		err = prepare_shared_mr(mr, access_flags, mr_id);
575 		if (err)
576 			goto err_mr;
577 	}
578 #endif
579 	if (ib_peer_mem) {
580 		if (access_flags & IB_ACCESS_MW_BIND) {
581 			/* Prevent binding MW on peer clients.
582 			* mlx4_invalidate_umem must be void,
583 			* therefore, mlx4_mr_free should not fail
584 			* when using peer clients. */
585 			err = -ENOSYS;
586 			pr_err("MW is not supported with peer memory client");
587 			goto err_smr;
588 		}
589 		init_completion(&mr->invalidation_comp);
590 		ib_umem_activate_invalidation_notifier(mr->umem,
591 					mlx4_invalidate_umem, mr);
592 	}
593 
594 	atomic_set(&mr->invalidated, 0);
595 	return &mr->ibmr;
596 
597 err_smr:
598 /* No suuport for Shared MR */
599 #if 0
600 	if (mr->smr_info)
601 		free_smr_info(mr);
602 #endif
603 err_mr:
604 	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &mr->mmr);
605 
606 err_umem:
607 	ib_umem_release(mr->umem);
608 
609 err_free:
610 	kfree(mr);
611 
612 	return ERR_PTR(err);
613 }
614 
615 int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
616 {
617 	struct mlx4_ib_mr *mr = to_mmr(ibmr);
618 	struct ib_umem *umem = mr->umem;
619 	int ret;
620 
621 /* No suuport for Shared MR */
622 #if 0
623 	if (mr->smr_info)
624 		free_smr_info(mr);
625 #endif
626 
627 	if (atomic_inc_return(&mr->invalidated) > 1) {
628 		wait_for_completion(&mr->invalidation_comp);
629 		goto end;
630 	}
631 
632 	ret = mlx4_mr_free(to_mdev(ibmr->device)->dev, &mr->mmr);
633 	if (ret) {
634 		/* Error is not expected here, except when memory windows
635 		* are bound to MR which is not supported with
636 		* peer memory clients */
637 		atomic_set(&mr->invalidated, 0);
638 		return ret;
639 	}
640 
641 	if (!umem)
642 		goto end;
643 
644 	ib_umem_release(mr->umem);
645 end:
646 
647 	kfree(mr);
648 
649 	return 0;
650 }
651 
652 struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type)
653 {
654 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
655 	struct mlx4_ib_mw *mw;
656 	int err;
657 
658 	mw = kmalloc(sizeof(*mw), GFP_KERNEL);
659 	if (!mw)
660 		return ERR_PTR(-ENOMEM);
661 
662 	err = mlx4_mw_alloc(dev->dev, to_mpd(pd)->pdn, (enum mlx4_mw_type)type, &mw->mmw);
663 	if (err)
664 		goto err_free;
665 
666 	err = mlx4_mw_enable(dev->dev, &mw->mmw);
667 	if (err)
668 		goto err_mw;
669 
670 	mw->ibmw.rkey = mw->mmw.key;
671 
672 	return &mw->ibmw;
673 
674 err_mw:
675 	mlx4_mw_free(dev->dev, &mw->mmw);
676 
677 err_free:
678 	kfree(mw);
679 
680 	return ERR_PTR(err);
681 }
682 
683 int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw,
684 		    struct ib_mw_bind *mw_bind)
685 {
686 	struct ib_send_wr  wr;
687 	struct ib_send_wr *bad_wr;
688 	int ret;
689 
690 	memset(&wr, 0, sizeof(wr));
691 	wr.opcode               = IB_WR_BIND_MW;
692 	wr.wr_id                = mw_bind->wr_id;
693 	wr.send_flags           = mw_bind->send_flags;
694 	wr.wr.bind_mw.mw        = mw;
695 	wr.wr.bind_mw.bind_info = mw_bind->bind_info;
696 	wr.wr.bind_mw.rkey      = ib_inc_rkey(mw->rkey);
697 
698 	ret = mlx4_ib_post_send(qp, &wr, &bad_wr);
699 	if (!ret)
700 		mw->rkey = wr.wr.bind_mw.rkey;
701 
702 	return ret;
703 }
704 
705 int mlx4_ib_dealloc_mw(struct ib_mw *ibmw)
706 {
707 	struct mlx4_ib_mw *mw = to_mmw(ibmw);
708 
709 	mlx4_mw_free(to_mdev(ibmw->device)->dev, &mw->mmw);
710 	kfree(mw);
711 
712 	return 0;
713 }
714 
715 struct ib_mr *mlx4_ib_alloc_fast_reg_mr(struct ib_pd *pd,
716 					int max_page_list_len)
717 {
718 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
719 	struct mlx4_ib_mr *mr;
720 	int err;
721 
722 	mr = kzalloc(sizeof *mr, GFP_KERNEL);
723 	if (!mr)
724 		return ERR_PTR(-ENOMEM);
725 
726 	err = mlx4_mr_alloc(dev->dev, to_mpd(pd)->pdn, 0, 0, 0,
727 			    max_page_list_len, 0, &mr->mmr);
728 	if (err)
729 		goto err_free;
730 
731 	err = mlx4_mr_enable(dev->dev, &mr->mmr);
732 	if (err)
733 		goto err_mr;
734 
735 	mr->ibmr.rkey = mr->ibmr.lkey = mr->mmr.key;
736 	mr->umem = NULL;
737 
738 	return &mr->ibmr;
739 
740 err_mr:
741 	(void) mlx4_mr_free(dev->dev, &mr->mmr);
742 
743 err_free:
744 	kfree(mr);
745 	return ERR_PTR(err);
746 }
747 
748 struct ib_fast_reg_page_list *mlx4_ib_alloc_fast_reg_page_list(struct ib_device *ibdev,
749 							       int page_list_len)
750 {
751 	struct mlx4_ib_dev *dev = to_mdev(ibdev);
752 	struct mlx4_ib_fast_reg_page_list *mfrpl;
753 	int size = page_list_len * sizeof (u64);
754 
755 	if (page_list_len > MLX4_MAX_FAST_REG_PAGES)
756 		return ERR_PTR(-EINVAL);
757 
758 	mfrpl = kmalloc(sizeof *mfrpl, GFP_KERNEL);
759 	if (!mfrpl)
760 		return ERR_PTR(-ENOMEM);
761 
762 	mfrpl->ibfrpl.page_list = kmalloc(size, GFP_KERNEL);
763 	if (!mfrpl->ibfrpl.page_list)
764 		goto err_free;
765 
766 	mfrpl->mapped_page_list = dma_alloc_coherent(&dev->dev->pdev->dev,
767 						     size, &mfrpl->map,
768 						     GFP_KERNEL);
769 	if (!mfrpl->mapped_page_list)
770 		goto err_free;
771 
772 	WARN_ON(mfrpl->map & 0x3f);
773 
774 	return &mfrpl->ibfrpl;
775 
776 err_free:
777 	kfree(mfrpl->ibfrpl.page_list);
778 	kfree(mfrpl);
779 	return ERR_PTR(-ENOMEM);
780 }
781 
782 void mlx4_ib_free_fast_reg_page_list(struct ib_fast_reg_page_list *page_list)
783 {
784 	struct mlx4_ib_dev *dev = to_mdev(page_list->device);
785 	struct mlx4_ib_fast_reg_page_list *mfrpl = to_mfrpl(page_list);
786 	int size = page_list->max_page_list_len * sizeof (u64);
787 
788 	dma_free_coherent(&dev->dev->pdev->dev, size, mfrpl->mapped_page_list,
789 			  mfrpl->map);
790 	kfree(mfrpl->ibfrpl.page_list);
791 	kfree(mfrpl);
792 }
793 
794 struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
795 				 struct ib_fmr_attr *fmr_attr)
796 {
797 	struct mlx4_ib_dev *dev = to_mdev(pd->device);
798 	struct mlx4_ib_fmr *fmr;
799 	int err = -ENOMEM;
800 
801 	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
802 	if (!fmr)
803 		return ERR_PTR(-ENOMEM);
804 
805 	err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc),
806 			     fmr_attr->max_pages, fmr_attr->max_maps,
807 			     fmr_attr->page_shift, &fmr->mfmr);
808 	if (err)
809 		goto err_free;
810 
811 	err = mlx4_fmr_enable(to_mdev(pd->device)->dev, &fmr->mfmr);
812 	if (err)
813 		goto err_mr;
814 
815 	fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key;
816 
817 	return &fmr->ibfmr;
818 
819 err_mr:
820 	(void) mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);
821 
822 err_free:
823 	kfree(fmr);
824 
825 	return ERR_PTR(err);
826 }
827 
828 int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
829 		      int npages, u64 iova)
830 {
831 	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
832 	struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device);
833 
834 	return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova,
835 				 &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
836 }
837 
838 int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
839 {
840 	struct ib_fmr *ibfmr;
841 	int err;
842 	struct mlx4_dev *mdev = NULL;
843 
844 	list_for_each_entry(ibfmr, fmr_list, list) {
845 		if (mdev && to_mdev(ibfmr->device)->dev != mdev)
846 			return -EINVAL;
847 		mdev = to_mdev(ibfmr->device)->dev;
848 	}
849 
850 	if (!mdev)
851 		return 0;
852 
853 	list_for_each_entry(ibfmr, fmr_list, list) {
854 		struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
855 
856 		mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
857 	}
858 
859 	/*
860 	 * Make sure all MPT status updates are visible before issuing
861 	 * SYNC_TPT firmware command.
862 	 */
863 	wmb();
864 
865 	err = mlx4_SYNC_TPT(mdev);
866 	if (err)
867 		pr_warn("SYNC_TPT error %d when "
868 		       "unmapping FMRs\n", err);
869 
870 	return 0;
871 }
872 
873 int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
874 {
875 	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
876 	struct mlx4_ib_dev *dev = to_mdev(ibfmr->device);
877 	int err;
878 
879 	err = mlx4_fmr_free(dev->dev, &ifmr->mfmr);
880 
881 	if (!err)
882 		kfree(ifmr);
883 
884 	return err;
885 }
886