xref: /freebsd/sys/dev/mlx5/mlx5_ib/mlx5_ib_mr.c (revision f126890ac5386406dadf7c4cfa9566cbb56537c5)
1 /*-
2  * Copyright (c) 2013-2021, Mellanox Technologies, Ltd.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 #include "opt_rss.h"
27 #include "opt_ratelimit.h"
28 
29 #include <linux/kref.h>
30 #include <linux/random.h>
31 #include <linux/delay.h>
32 #include <linux/sched.h>
33 #include <rdma/ib_umem.h>
34 #include <rdma/ib_umem_odp.h>
35 #include <rdma/ib_verbs.h>
36 #include <dev/mlx5/mlx5_ib/mlx5_ib.h>
37 
/*
 * Cap on the number of asynchronous mkey-creation commands that may be
 * outstanding for a single cache entry; add_keys() stops with -EAGAIN
 * once ent->pending reaches it.
 */
enum {
	MAX_PENDING_REG_MR = 8,
};

/* Alignment, in bytes, applied to the page-address buffers built for UMR. */
#define MLX5_UMR_ALIGN 2048
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
 * Static fallback buffer that mlx5_ib_update_mtt() switches to when its
 * GFP_ATOMIC page allocation fails.  mlx5_ib_update_mtt() holds
 * mlx5_ib_update_mtt_emergency_buffer_mutex for as long as it uses it.
 */
static __be64 mlx5_ib_update_mtt_emergency_buffer[
		MLX5_UMR_MTT_MIN_CHUNK_SIZE/sizeof(__be64)]
	__aligned(MLX5_UMR_ALIGN);
static DEFINE_MUTEX(mlx5_ib_update_mtt_emergency_buffer_mutex);
#endif

/* Forward declaration. */
static int clean_mr(struct mlx5_ib_mr *mr);
51 
52 static int destroy_mkey(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
53 {
54 	int err = mlx5_core_destroy_mkey(dev->mdev, &mr->mmkey);
55 
56 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
57 	/* Wait until all page fault handlers using the mr complete. */
58 	synchronize_srcu(&dev->mr_srcu);
59 #endif
60 
61 	return err;
62 }
63 
64 static int order2idx(struct mlx5_ib_dev *dev, int order)
65 {
66 	struct mlx5_mr_cache *cache = &dev->cache;
67 
68 	if (order < cache->ent[0].order)
69 		return 0;
70 	else
71 		return order - cache->ent[0].order;
72 }
73 
74 static bool use_umr_mtt_update(struct mlx5_ib_mr *mr, u64 start, u64 length)
75 {
76 	return ((u64)1 << mr->order) * MLX5_ADAPTER_PAGE_SIZE >=
77 		length + (start & (MLX5_ADAPTER_PAGE_SIZE - 1));
78 }
79 
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
 * Publish @mr as the private data of its umem's ODP state.
 * Nothing is done when the umem has no odp_data.
 */
static void update_odp_mr(struct mlx5_ib_mr *mr)
{
	if (!mr->umem->odp_data)
		return;

	/*
	 * Keep the store to umem->odp_data->private from being moved
	 * ahead of the completion of reg_umr: the MR has to be fully
	 * initialized before invalidations start being handled for it.
	 */
	smp_wmb();
	mr->umem->odp_data->private = mr;
	/*
	 * The invalidation routines must observe the new
	 * umem->odp_data->private value before page faults can hit the
	 * MR, which becomes possible once the MR is put in the tree
	 * after this point.  Without this barrier a fault could be
	 * handled, and an invalidation run, before
	 * umem->odp_data->private == mr is visible to the invalidation
	 * handler.
	 */
	smp_wmb();
}
#endif
107 
108 static void reg_mr_callback(int status, struct mlx5_async_work *context)
109 {
110 	struct mlx5_ib_mr *mr =
111 		container_of(context, struct mlx5_ib_mr, cb_work);
112 	struct mlx5_ib_dev *dev = mr->dev;
113 	struct mlx5_mr_cache *cache = &dev->cache;
114 	int c = order2idx(dev, mr->order);
115 	struct mlx5_cache_ent *ent = &cache->ent[c];
116 	u8 key;
117 	unsigned long flags;
118 	struct mlx5_mr_table *table = &dev->mdev->priv.mr_table;
119 	int err;
120 
121 	spin_lock_irqsave(&ent->lock, flags);
122 	ent->pending--;
123 	spin_unlock_irqrestore(&ent->lock, flags);
124 	if (status) {
125 		mlx5_ib_warn(dev, "async reg mr failed. status %d\n", status);
126 		kfree(mr);
127 		dev->fill_delay = 1;
128 		mod_timer(&dev->delay_timer, jiffies + HZ);
129 		return;
130 	}
131 
132 	spin_lock_irqsave(&dev->mdev->priv.mkey_lock, flags);
133 	key = dev->mdev->priv.mkey_key++;
134 	spin_unlock_irqrestore(&dev->mdev->priv.mkey_lock, flags);
135 	mr->mmkey.key = mlx5_idx_to_mkey(MLX5_GET(create_mkey_out, mr->out, mkey_index)) | key;
136 
137 	cache->last_add = jiffies;
138 
139 	spin_lock_irqsave(&ent->lock, flags);
140 	list_add_tail(&mr->list, &ent->head);
141 	ent->cur++;
142 	ent->size++;
143 	spin_unlock_irqrestore(&ent->lock, flags);
144 
145 	spin_lock_irqsave(&table->lock, flags);
146 	err = radix_tree_insert(&table->tree, mlx5_mkey_to_idx(mr->mmkey.key),
147 				&mr->mmkey);
148 	if (err)
149 		pr_err("Error inserting to mkey tree. 0x%x\n", -err);
150 	spin_unlock_irqrestore(&table->lock, flags);
151 }
152 
153 static int add_keys(struct mlx5_ib_dev *dev, int c, int num)
154 {
155 	struct mlx5_mr_cache *cache = &dev->cache;
156 	struct mlx5_cache_ent *ent = &cache->ent[c];
157 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
158 	struct mlx5_ib_mr *mr;
159 	int npages = 1 << ent->order;
160 	void *mkc;
161 	u32 *in;
162 	int err = 0;
163 	int i;
164 
165 	in = kzalloc(inlen, GFP_KERNEL);
166 	if (!in)
167 		return -ENOMEM;
168 
169 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
170 	for (i = 0; i < num; i++) {
171 		if (ent->pending >= MAX_PENDING_REG_MR) {
172 			err = -EAGAIN;
173 			break;
174 		}
175 
176 		mr = kzalloc(sizeof(*mr), GFP_KERNEL);
177 		if (!mr) {
178 			err = -ENOMEM;
179 			break;
180 		}
181 		mr->order = ent->order;
182 		mr->umred = 1;
183 		mr->dev = dev;
184 
185 		MLX5_SET(mkc, mkc, free, 1);
186 		MLX5_SET(mkc, mkc, umr_en, 1);
187 		MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_MTT);
188 
189 		MLX5_SET(mkc, mkc, qpn, 0xffffff);
190 		MLX5_SET(mkc, mkc, translations_octword_size, (npages + 1) / 2);
191 		MLX5_SET(mkc, mkc, log_page_size, 12);
192 
193 		spin_lock_irq(&ent->lock);
194 		ent->pending++;
195 		spin_unlock_irq(&ent->lock);
196 		err = mlx5_core_create_mkey_cb(dev->mdev, &mr->mmkey,
197 					       &dev->async_ctx, in, inlen,
198 					       mr->out, sizeof(mr->out),
199 					       reg_mr_callback, &mr->cb_work);
200 		if (err) {
201 			spin_lock_irq(&ent->lock);
202 			ent->pending--;
203 			spin_unlock_irq(&ent->lock);
204 			mlx5_ib_warn(dev, "create mkey failed %d\n", err);
205 			kfree(mr);
206 			break;
207 		}
208 	}
209 
210 	kfree(in);
211 	return err;
212 }
213 
214 static void remove_keys(struct mlx5_ib_dev *dev, int c, int num)
215 {
216 	struct mlx5_mr_cache *cache = &dev->cache;
217 	struct mlx5_cache_ent *ent = &cache->ent[c];
218 	struct mlx5_ib_mr *mr;
219 	int err;
220 	int i;
221 
222 	for (i = 0; i < num; i++) {
223 		spin_lock_irq(&ent->lock);
224 		if (list_empty(&ent->head)) {
225 			spin_unlock_irq(&ent->lock);
226 			return;
227 		}
228 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
229 		list_del(&mr->list);
230 		ent->cur--;
231 		ent->size--;
232 		spin_unlock_irq(&ent->lock);
233 		err = destroy_mkey(dev, mr);
234 		if (err)
235 			mlx5_ib_warn(dev, "failed destroy mkey\n");
236 		else
237 			kfree(mr);
238 	}
239 }
240 
241 static int someone_adding(struct mlx5_mr_cache *cache)
242 {
243 	int i;
244 
245 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
246 		if (cache->ent[i].cur < cache->ent[i].limit)
247 			return 1;
248 	}
249 
250 	return 0;
251 }
252 
253 static void __cache_work_func(struct mlx5_cache_ent *ent)
254 {
255 	struct mlx5_ib_dev *dev = ent->dev;
256 	struct mlx5_mr_cache *cache = &dev->cache;
257 	int i = order2idx(dev, ent->order);
258 	int err;
259 
260 	if (cache->stopped)
261 		return;
262 
263 	ent = &dev->cache.ent[i];
264 	if (ent->cur < 2 * ent->limit && !dev->fill_delay) {
265 		err = add_keys(dev, i, 1);
266 		if (ent->cur < 2 * ent->limit) {
267 			if (err == -EAGAIN) {
268 				mlx5_ib_dbg(dev, "returned eagain, order %d\n",
269 					    i + 2);
270 				queue_delayed_work(cache->wq, &ent->dwork,
271 						   msecs_to_jiffies(3));
272 			} else if (err) {
273 				mlx5_ib_warn(dev, "command failed order %d, err %d\n",
274 					     i + 2, err);
275 				queue_delayed_work(cache->wq, &ent->dwork,
276 						   msecs_to_jiffies(1000));
277 			} else {
278 				queue_work(cache->wq, &ent->work);
279 			}
280 		}
281 	} else if (ent->cur > 2 * ent->limit) {
282 		/*
283 		 * The remove_keys() logic is performed as garbage collection
284 		 * task. Such task is intended to be run when no other active
285 		 * processes are running.
286 		 *
287 		 * The need_resched() will return TRUE if there are user tasks
288 		 * to be activated in near future.
289 		 *
290 		 * In such case, we don't execute remove_keys() and postpone
291 		 * the garbage collection work to try to run in next cycle,
292 		 * in order to free CPU resources to other tasks.
293 		 */
294 		if (!need_resched() && !someone_adding(cache) &&
295 		    time_after(jiffies, cache->last_add + 300 * HZ)) {
296 			remove_keys(dev, i, 1);
297 			if (ent->cur > ent->limit)
298 				queue_work(cache->wq, &ent->work);
299 		} else {
300 			queue_delayed_work(cache->wq, &ent->dwork, 300 * HZ);
301 		}
302 	}
303 }
304 
305 static void delayed_cache_work_func(struct work_struct *work)
306 {
307 	struct mlx5_cache_ent *ent;
308 
309 	ent = container_of(work, struct mlx5_cache_ent, dwork.work);
310 	__cache_work_func(ent);
311 }
312 
313 static void cache_work_func(struct work_struct *work)
314 {
315 	struct mlx5_cache_ent *ent;
316 
317 	ent = container_of(work, struct mlx5_cache_ent, work);
318 	__cache_work_func(ent);
319 }
320 
321 static struct mlx5_ib_mr *alloc_cached_mr(struct mlx5_ib_dev *dev, int order)
322 {
323 	struct mlx5_mr_cache *cache = &dev->cache;
324 	struct mlx5_ib_mr *mr = NULL;
325 	struct mlx5_cache_ent *ent;
326 	int c;
327 	int i;
328 
329 	c = order2idx(dev, order);
330 	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
331 		mlx5_ib_warn(dev, "order %d, cache index %d\n", order, c);
332 		return NULL;
333 	}
334 
335 	for (i = c; i < MAX_MR_CACHE_ENTRIES; i++) {
336 		ent = &cache->ent[i];
337 
338 		mlx5_ib_dbg(dev, "order %d, cache index %d\n", ent->order, i);
339 
340 		spin_lock_irq(&ent->lock);
341 		if (!list_empty(&ent->head)) {
342 			mr = list_first_entry(&ent->head, struct mlx5_ib_mr,
343 					      list);
344 			list_del(&mr->list);
345 			ent->cur--;
346 			spin_unlock_irq(&ent->lock);
347 			if (ent->cur < ent->limit)
348 				queue_work(cache->wq, &ent->work);
349 			break;
350 		}
351 		spin_unlock_irq(&ent->lock);
352 
353 		queue_work(cache->wq, &ent->work);
354 	}
355 
356 	if (!mr)
357 		cache->ent[c].miss++;
358 
359 	return mr;
360 }
361 
362 static void free_cached_mr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
363 {
364 	struct mlx5_mr_cache *cache = &dev->cache;
365 	struct mlx5_cache_ent *ent;
366 	int shrink = 0;
367 	int c;
368 
369 	c = order2idx(dev, mr->order);
370 	if (c < 0 || c >= MAX_MR_CACHE_ENTRIES) {
371 		mlx5_ib_warn(dev, "order %d, cache index %d\n", mr->order, c);
372 		return;
373 	}
374 	ent = &cache->ent[c];
375 	spin_lock_irq(&ent->lock);
376 	list_add_tail(&mr->list, &ent->head);
377 	ent->cur++;
378 	if (ent->cur > 2 * ent->limit)
379 		shrink = 1;
380 	spin_unlock_irq(&ent->lock);
381 
382 	if (shrink)
383 		queue_work(cache->wq, &ent->work);
384 }
385 
386 static void clean_keys(struct mlx5_ib_dev *dev, int c)
387 {
388 	struct mlx5_mr_cache *cache = &dev->cache;
389 	struct mlx5_cache_ent *ent = &cache->ent[c];
390 	struct mlx5_ib_mr *mr;
391 	int err;
392 
393 	cancel_delayed_work(&ent->dwork);
394 	while (1) {
395 		spin_lock_irq(&ent->lock);
396 		if (list_empty(&ent->head)) {
397 			spin_unlock_irq(&ent->lock);
398 			return;
399 		}
400 		mr = list_first_entry(&ent->head, struct mlx5_ib_mr, list);
401 		list_del(&mr->list);
402 		ent->cur--;
403 		ent->size--;
404 		spin_unlock_irq(&ent->lock);
405 		err = destroy_mkey(dev, mr);
406 		if (err)
407 			mlx5_ib_warn(dev, "failed destroy mkey\n");
408 		else
409 			kfree(mr);
410 	}
411 }
412 
413 static void delay_time_func(unsigned long ctx)
414 {
415 	struct mlx5_ib_dev *dev = (struct mlx5_ib_dev *)ctx;
416 
417 	dev->fill_delay = 0;
418 }
419 
420 int mlx5_mr_cache_init(struct mlx5_ib_dev *dev)
421 {
422 	struct mlx5_mr_cache *cache = &dev->cache;
423 	struct mlx5_cache_ent *ent;
424 	int limit;
425 	int i;
426 
427 	mutex_init(&dev->slow_path_mutex);
428 	cache->wq = alloc_ordered_workqueue("mkey_cache", WQ_MEM_RECLAIM);
429 	if (!cache->wq) {
430 		mlx5_ib_warn(dev, "failed to create work queue\n");
431 		return -ENOMEM;
432 	}
433 
434 	mlx5_cmd_init_async_ctx(dev->mdev, &dev->async_ctx);
435 	setup_timer(&dev->delay_timer, delay_time_func, (unsigned long)dev);
436 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++) {
437 		INIT_LIST_HEAD(&cache->ent[i].head);
438 		spin_lock_init(&cache->ent[i].lock);
439 
440 		ent = &cache->ent[i];
441 		INIT_LIST_HEAD(&ent->head);
442 		spin_lock_init(&ent->lock);
443 		ent->order = i + 2;
444 		ent->dev = dev;
445 
446 		if (dev->mdev->profile->mask & MLX5_PROF_MASK_MR_CACHE)
447 			limit = dev->mdev->profile->mr_cache[i].limit;
448 		else
449 			limit = 0;
450 
451 		INIT_WORK(&ent->work, cache_work_func);
452 		INIT_DELAYED_WORK(&ent->dwork, delayed_cache_work_func);
453 		ent->limit = limit;
454 		queue_work(cache->wq, &ent->work);
455 	}
456 
457 	return 0;
458 }
459 
460 int mlx5_mr_cache_cleanup(struct mlx5_ib_dev *dev)
461 {
462 	int i;
463 
464 	dev->cache.stopped = 1;
465 	flush_workqueue(dev->cache.wq);
466 	mlx5_cmd_cleanup_async_ctx(&dev->async_ctx);
467 
468 	for (i = 0; i < MAX_MR_CACHE_ENTRIES; i++)
469 		clean_keys(dev, i);
470 
471 	destroy_workqueue(dev->cache.wq);
472 	del_timer_sync(&dev->delay_timer);
473 
474 	return 0;
475 }
476 
477 struct ib_mr *mlx5_ib_get_dma_mr(struct ib_pd *pd, int acc)
478 {
479 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
480 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
481 	struct mlx5_core_dev *mdev = dev->mdev;
482 	struct mlx5_ib_mr *mr;
483 	void *mkc;
484 	u32 *in;
485 	int err;
486 
487 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
488 	if (!mr)
489 		return ERR_PTR(-ENOMEM);
490 
491 	in = kzalloc(inlen, GFP_KERNEL);
492 	if (!in) {
493 		err = -ENOMEM;
494 		goto err_free;
495 	}
496 
497 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
498 
499 	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_PA);
500 	MLX5_SET(mkc, mkc, a, !!(acc & IB_ACCESS_REMOTE_ATOMIC));
501 	MLX5_SET(mkc, mkc, rw, !!(acc & IB_ACCESS_REMOTE_WRITE));
502 	MLX5_SET(mkc, mkc, rr, !!(acc & IB_ACCESS_REMOTE_READ));
503 	MLX5_SET(mkc, mkc, lw, !!(acc & IB_ACCESS_LOCAL_WRITE));
504 	MLX5_SET(mkc, mkc, lr, 1);
505 
506 	MLX5_SET(mkc, mkc, length64, 1);
507 	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
508 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
509 	MLX5_SET64(mkc, mkc, start_addr, 0);
510 
511 	err = mlx5_core_create_mkey(mdev, &mr->mmkey, in, inlen);
512 	if (err)
513 		goto err_in;
514 
515 	kfree(in);
516 	mr->ibmr.lkey = mr->mmkey.key;
517 	mr->ibmr.rkey = mr->mmkey.key;
518 	mr->umem = NULL;
519 
520 	return &mr->ibmr;
521 
522 err_in:
523 	kfree(in);
524 
525 err_free:
526 	kfree(mr);
527 
528 	return ERR_PTR(err);
529 }
530 
531 static int get_octo_len(u64 addr, u64 len, int page_size)
532 {
533 	u64 offset;
534 	int npages;
535 
536 	offset = addr & (page_size - 1);
537 	npages = ALIGN(len + offset, page_size) >> ilog2(page_size);
538 	return (npages + 1) / 2;
539 }
540 
541 static int use_umr(int order)
542 {
543 	return order <= MLX5_MAX_UMR_SHIFT;
544 }
545 
546 static int dma_map_mr_pas(struct mlx5_ib_dev *dev, struct ib_umem *umem,
547 			  int npages, int page_shift, int *size,
548 			  __be64 **mr_pas, dma_addr_t *dma)
549 {
550 	__be64 *pas;
551 	struct device *ddev = dev->ib_dev.dma_device;
552 
553 	/*
554 	 * UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes.
555 	 * To avoid copying garbage after the pas array, we allocate
556 	 * a little more.
557 	 */
558 	*size = ALIGN(sizeof(u64) * npages, MLX5_UMR_MTT_ALIGNMENT);
559 	*mr_pas = kmalloc(*size + MLX5_UMR_ALIGN - 1, GFP_KERNEL);
560 	if (!(*mr_pas))
561 		return -ENOMEM;
562 
563 	pas = PTR_ALIGN(*mr_pas, MLX5_UMR_ALIGN);
564 	mlx5_ib_populate_pas(dev, umem, page_shift, pas, MLX5_IB_MTT_PRESENT);
565 	/* Clear padding after the actual pages. */
566 	memset(pas + npages, 0, *size - npages * sizeof(u64));
567 
568 	*dma = dma_map_single(ddev, pas, *size, DMA_TO_DEVICE);
569 	if (dma_mapping_error(ddev, *dma)) {
570 		kfree(*mr_pas);
571 		return -ENOMEM;
572 	}
573 
574 	return 0;
575 }
576 
577 static void prep_umr_wqe_common(struct ib_pd *pd, struct mlx5_umr_wr *umrwr,
578 				struct ib_sge *sg, u64 dma, int n, u32 key,
579 				int page_shift)
580 {
581 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
582 
583 	sg->addr = dma;
584 	sg->length = ALIGN(sizeof(u64) * n, 64);
585 	sg->lkey = dev->umrc.pd->local_dma_lkey;
586 
587 	umrwr->wr.next = NULL;
588 	umrwr->wr.sg_list = sg;
589 	if (n)
590 		umrwr->wr.num_sge = 1;
591 	else
592 		umrwr->wr.num_sge = 0;
593 
594 	umrwr->wr.opcode = MLX5_IB_WR_UMR;
595 
596 	umrwr->npages = n;
597 	umrwr->page_shift = page_shift;
598 	umrwr->mkey = key;
599 }
600 
601 static void prep_umr_reg_wqe(struct ib_pd *pd, struct mlx5_umr_wr *umrwr,
602 			     struct ib_sge *sg, u64 dma, int n, u32 key,
603 			     int page_shift, u64 virt_addr, u64 len,
604 			     int access_flags)
605 {
606 	prep_umr_wqe_common(pd, umrwr, sg, dma, n, key, page_shift);
607 
608 	umrwr->wr.send_flags = 0;
609 
610 	umrwr->target.virt_addr = virt_addr;
611 	umrwr->length = len;
612 	umrwr->access_flags = access_flags;
613 	umrwr->pd = pd;
614 }
615 
616 static void prep_umr_unreg_wqe(struct mlx5_ib_dev *dev,
617 			       struct mlx5_umr_wr *umrwr, u32 key)
618 {
619 	umrwr->wr.send_flags = MLX5_IB_SEND_UMR_UNREG | MLX5_IB_SEND_UMR_FAIL_IF_FREE;
620 	umrwr->wr.opcode = MLX5_IB_WR_UMR;
621 	umrwr->mkey = key;
622 }
623 
624 static struct ib_umem *mr_umem_get(struct ib_pd *pd, u64 start, u64 length,
625 				   int access_flags, int *npages,
626 				   int *page_shift, int *ncont, int *order)
627 {
628 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
629 	struct ib_umem *umem = ib_umem_get(pd->uobject->context, start, length,
630 					   access_flags, 0);
631 	if (IS_ERR(umem)) {
632 		mlx5_ib_err(dev, "umem get failed (%ld)\n", PTR_ERR(umem));
633 		return (void *)umem;
634 	}
635 
636 	mlx5_ib_cont_pages(umem, start, MLX5_MKEY_PAGE_SHIFT_MASK, npages, page_shift, ncont, order);
637 	if (!*npages) {
638 		mlx5_ib_warn(dev, "avoid zero region\n");
639 		ib_umem_release(umem);
640 		return ERR_PTR(-EINVAL);
641 	}
642 
643 	mlx5_ib_dbg(dev, "npages %d, ncont %d, order %d, page_shift %d\n",
644 		    *npages, *ncont, *order, *page_shift);
645 
646 	return umem;
647 }
648 
649 static void mlx5_ib_umr_done(struct ib_cq *cq, struct ib_wc *wc)
650 {
651 	struct mlx5_ib_umr_context *context =
652 		container_of(wc->wr_cqe, struct mlx5_ib_umr_context, cqe);
653 
654 	context->status = wc->status;
655 	complete(&context->done);
656 }
657 
658 static inline void mlx5_ib_init_umr_context(struct mlx5_ib_umr_context *context)
659 {
660 	context->cqe.done = mlx5_ib_umr_done;
661 	context->status = -1;
662 	init_completion(&context->done);
663 }
664 
665 static struct mlx5_ib_mr *reg_umr(struct ib_pd *pd, struct ib_umem *umem,
666 				  u64 virt_addr, u64 len, int npages,
667 				  int page_shift, int order, int access_flags)
668 {
669 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
670 	struct device *ddev = dev->ib_dev.dma_device;
671 	struct umr_common *umrc = &dev->umrc;
672 	struct mlx5_ib_umr_context umr_context;
673 	struct mlx5_umr_wr umrwr = {};
674 	const struct ib_send_wr *bad;
675 	struct mlx5_ib_mr *mr;
676 	struct ib_sge sg;
677 	int size;
678 	__be64 *mr_pas;
679 	dma_addr_t dma;
680 	int err = 0;
681 	int i;
682 
683 	for (i = 0; i < 1; i++) {
684 		mr = alloc_cached_mr(dev, order);
685 		if (mr)
686 			break;
687 
688 		err = add_keys(dev, order2idx(dev, order), 1);
689 		if (err && err != -EAGAIN) {
690 			mlx5_ib_warn(dev, "add_keys failed, err %d\n", err);
691 			break;
692 		}
693 	}
694 
695 	if (!mr)
696 		return ERR_PTR(-EAGAIN);
697 
698 	err = dma_map_mr_pas(dev, umem, npages, page_shift, &size, &mr_pas,
699 			     &dma);
700 	if (err)
701 		goto free_mr;
702 
703 	mlx5_ib_init_umr_context(&umr_context);
704 
705 	umrwr.wr.wr_cqe = &umr_context.cqe;
706 	prep_umr_reg_wqe(pd, &umrwr, &sg, dma, npages, mr->mmkey.key,
707 			 page_shift, virt_addr, len, access_flags);
708 
709 	down(&umrc->sem);
710 	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
711 	if (err) {
712 		mlx5_ib_warn(dev, "post send failed, err %d\n", err);
713 		goto unmap_dma;
714 	} else {
715 		wait_for_completion(&umr_context.done);
716 		if (umr_context.status != IB_WC_SUCCESS) {
717 			mlx5_ib_warn(dev, "reg umr failed\n");
718 			err = -EFAULT;
719 		}
720 	}
721 
722 	mr->mmkey.iova = virt_addr;
723 	mr->mmkey.size = len;
724 	mr->mmkey.pd = to_mpd(pd)->pdn;
725 
726 	mr->live = 1;
727 
728 unmap_dma:
729 	up(&umrc->sem);
730 	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
731 
732 	kfree(mr_pas);
733 
734 free_mr:
735 	if (err) {
736 		free_cached_mr(dev, mr);
737 		return ERR_PTR(err);
738 	}
739 
740 	return mr;
741 }
742 
#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
/*
 * Rewrite a range of translation entries of @mr through UMR work
 * requests.
 *
 * @mr:               region whose entries are updated.
 * @start_page_index: index of the first entry to update.
 * @npages:           number of entries to update.
 * @zap:              when non-zero the entries are not read from the umem;
 *                    the zeroed staging buffer is sent as is.
 *
 * The range is first widened so that it starts and ends on a multiple of
 * MLX5_UMR_MTT_ALIGNMENT / sizeof(u64) entries.  The entries are then sent
 * in chunks through a single DMA-mapped staging buffer, each chunk waiting
 * for its UMR completion before the next one is prepared; the loop stops
 * at the first error.
 *
 * Returns 0 on success, -EINVAL when the widened range goes beyond
 * MLX5_MAX_UMR_PAGES, -ENOMEM when the staging buffer cannot be mapped,
 * the error from ib_post_send(), or -EFAULT when a UMR completes
 * unsuccessfully.
 */
int mlx5_ib_update_mtt(struct mlx5_ib_mr *mr, u64 start_page_index, int npages,
		       int zap)
{
	struct mlx5_ib_dev *dev = mr->dev;
	struct device *ddev = dev->ib_dev.dma_device;
	struct umr_common *umrc = &dev->umrc;
	struct mlx5_ib_umr_context umr_context;
	struct ib_umem *umem = mr->umem;
	int size;
	__be64 *pas;
	dma_addr_t dma;
	const struct ib_send_wr *bad;
	struct mlx5_umr_wr wr;
	struct ib_sge sg;
	int err = 0;
	/* Number of entries per MLX5_UMR_MTT_ALIGNMENT bytes, and its mask. */
	const int page_index_alignment = MLX5_UMR_MTT_ALIGNMENT / sizeof(u64);
	const int page_index_mask = page_index_alignment - 1;
	size_t pages_mapped = 0;
	size_t pages_to_map = 0;
	size_t pages_iter = 0;
	int use_emergency_buf = 0;

	/* UMR copies MTTs in units of MLX5_UMR_MTT_ALIGNMENT bytes,
	 * so we need to align the offset and length accordingly */
	if (start_page_index & page_index_mask) {
		npages += start_page_index & page_index_mask;
		start_page_index &= ~page_index_mask;
	}

	pages_to_map = ALIGN(npages, page_index_alignment);

	if (start_page_index + pages_to_map > MLX5_MAX_UMR_PAGES)
		return -EINVAL;

	/* The staging buffer never exceeds one page. */
	size = sizeof(u64) * pages_to_map;
	size = min_t(int, PAGE_SIZE, size);
	/* We allocate with GFP_ATOMIC to avoid recursion into page-reclaim
	 * code, when we are called from an invalidation. The pas buffer must
	 * be 2k-aligned for Connect-IB. */
	pas = (__be64 *)get_zeroed_page(GFP_ATOMIC);
	if (!pas) {
		mlx5_ib_warn(dev, "unable to allocate memory during MTT update, falling back to slower chunked mechanism.\n");
		/*
		 * Fall back to the static emergency buffer; its mutex stays
		 * held until the free_pas label.
		 */
		pas = mlx5_ib_update_mtt_emergency_buffer;
		size = MLX5_UMR_MTT_MIN_CHUNK_SIZE;
		use_emergency_buf = 1;
		mutex_lock(&mlx5_ib_update_mtt_emergency_buffer_mutex);
		memset(pas, 0, size);
	}
	/* Number of entries that fit in the staging buffer. */
	pages_iter = size / sizeof(u64);
	dma = dma_map_single(ddev, pas, size, DMA_TO_DEVICE);
	if (dma_mapping_error(ddev, dma)) {
		mlx5_ib_err(dev, "unable to map DMA during MTT update.\n");
		err = -ENOMEM;
		goto free_pas;
	}

	/* One UMR work request per pass, pages_iter entries at a time. */
	for (pages_mapped = 0;
	     pages_mapped < pages_to_map && !err;
	     pages_mapped += pages_iter, start_page_index += pages_iter) {
		dma_sync_single_for_cpu(ddev, dma, size, DMA_TO_DEVICE);

		/* Do not read entries past the last page of the umem. */
		npages = min_t(size_t,
			       pages_iter,
			       ib_umem_num_pages(umem) - start_page_index);

		if (!zap) {
			__mlx5_ib_populate_pas(dev, umem, PAGE_SHIFT,
					       start_page_index, npages, pas,
					       MLX5_IB_MTT_PRESENT);
			/* Clear padding after the pages brought from the
			 * umem. */
			memset(pas + npages, 0, size - npages * sizeof(u64));
		}

		dma_sync_single_for_device(ddev, dma, size, DMA_TO_DEVICE);

		mlx5_ib_init_umr_context(&umr_context);

		memset(&wr, 0, sizeof(wr));
		wr.wr.wr_cqe = &umr_context.cqe;

		sg.addr = dma;
		sg.length = ALIGN(npages * sizeof(u64),
				MLX5_UMR_MTT_ALIGNMENT);
		sg.lkey = dev->umrc.pd->local_dma_lkey;

		wr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE |
				MLX5_IB_SEND_UMR_UPDATE_MTT;
		wr.wr.sg_list = &sg;
		wr.wr.num_sge = 1;
		wr.wr.opcode = MLX5_IB_WR_UMR;
		wr.npages = sg.length / sizeof(u64);
		wr.page_shift = PAGE_SHIFT;
		wr.mkey = mr->mmkey.key;
		wr.target.offset = start_page_index;

		/* umrc->sem is held from posting until the completion. */
		down(&umrc->sem);
		err = ib_post_send(umrc->qp, &wr.wr, &bad);
		if (err) {
			mlx5_ib_err(dev, "UMR post send failed, err %d\n", err);
		} else {
			wait_for_completion(&umr_context.done);
			if (umr_context.status != IB_WC_SUCCESS) {
				mlx5_ib_err(dev, "UMR completion failed, code %d\n",
					    umr_context.status);
				err = -EFAULT;
			}
		}
		up(&umrc->sem);
	}
	dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);

free_pas:
	/* Release whichever staging buffer was used. */
	if (!use_emergency_buf)
		free_page((unsigned long)pas);
	else
		mutex_unlock(&mlx5_ib_update_mtt_emergency_buffer_mutex);

	return err;
}
#endif
865 
866 /*
867  * If ibmr is NULL it will be allocated by reg_create.
868  * Else, the given ibmr will be used.
869  */
870 static struct mlx5_ib_mr *reg_create(struct ib_mr *ibmr, struct ib_pd *pd,
871 				     u64 virt_addr, u64 length,
872 				     struct ib_umem *umem, int npages,
873 				     int page_shift, int access_flags)
874 {
875 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
876 	struct mlx5_ib_mr *mr;
877 	__be64 *pas;
878 	void *mkc;
879 	int inlen;
880 	u32 *in;
881 	int err;
882 	bool pg_cap = !!(MLX5_CAP_GEN(dev->mdev, pg));
883 
884 	mr = ibmr ? to_mmr(ibmr) : kzalloc(sizeof(*mr), GFP_KERNEL);
885 	if (!mr)
886 		return ERR_PTR(-ENOMEM);
887 
888 	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
889 		sizeof(*pas) * ((npages + 1) / 2) * 2;
890 	in = mlx5_vzalloc(inlen);
891 	if (!in) {
892 		err = -ENOMEM;
893 		goto err_1;
894 	}
895 	pas = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);
896 	mlx5_ib_populate_pas(dev, umem, page_shift, pas,
897 			     pg_cap ? MLX5_IB_MTT_PRESENT : 0);
898 
899 	/* The pg_access bit allows setting the access flags
900 	 * in the page list submitted with the command. */
901 	MLX5_SET(create_mkey_in, in, pg_access, !!(pg_cap));
902 
903 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
904 	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_MTT);
905 	MLX5_SET(mkc, mkc, a, !!(access_flags & IB_ACCESS_REMOTE_ATOMIC));
906 	MLX5_SET(mkc, mkc, rw, !!(access_flags & IB_ACCESS_REMOTE_WRITE));
907 	MLX5_SET(mkc, mkc, rr, !!(access_flags & IB_ACCESS_REMOTE_READ));
908 	MLX5_SET(mkc, mkc, lw, !!(access_flags & IB_ACCESS_LOCAL_WRITE));
909 	MLX5_SET(mkc, mkc, lr, 1);
910 
911 	MLX5_SET64(mkc, mkc, start_addr, virt_addr);
912 	MLX5_SET64(mkc, mkc, len, length);
913 	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
914 	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
915 	MLX5_SET(mkc, mkc, translations_octword_size,
916 		 get_octo_len(virt_addr, length, 1 << page_shift));
917 	MLX5_SET(mkc, mkc, log_page_size, page_shift);
918 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
919 	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
920 		 get_octo_len(virt_addr, length, 1 << page_shift));
921 
922 	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
923 	if (err) {
924 		mlx5_ib_warn(dev, "create mkey failed\n");
925 		goto err_2;
926 	}
927 	mr->umem = umem;
928 	mr->dev = dev;
929 	mr->live = 1;
930 	kvfree(in);
931 
932 	mlx5_ib_dbg(dev, "mkey = 0x%x\n", mr->mmkey.key);
933 
934 	return mr;
935 
936 err_2:
937 	kvfree(in);
938 
939 err_1:
940 	if (!ibmr)
941 		kfree(mr);
942 
943 	return ERR_PTR(err);
944 }
945 
946 static void set_mr_fileds(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr,
947 			  int npages, u64 length, int access_flags)
948 {
949 	mr->npages = npages;
950 	atomic_add(npages, &dev->mdev->priv.reg_pages);
951 	mr->ibmr.lkey = mr->mmkey.key;
952 	mr->ibmr.rkey = mr->mmkey.key;
953 	mr->ibmr.length = length;
954 	mr->access_flags = access_flags;
955 }
956 
957 struct ib_mr *mlx5_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
958 				  u64 virt_addr, int access_flags,
959 				  struct ib_udata *udata)
960 {
961 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
962 	struct mlx5_ib_mr *mr = NULL;
963 	struct ib_umem *umem;
964 	int page_shift;
965 	int npages;
966 	int ncont;
967 	int order;
968 	int err;
969 
970 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
971 		    (long long)start, (long long)virt_addr, (long long)length, access_flags);
972 	umem = mr_umem_get(pd, start, length, access_flags, &npages,
973 			   &page_shift, &ncont, &order);
974 
975 	if (IS_ERR(umem))
976 		return (void *)umem;
977 
978 	if (use_umr(order)) {
979 		mr = reg_umr(pd, umem, virt_addr, length, ncont, page_shift,
980 			     order, access_flags);
981 		if (PTR_ERR(mr) == -EAGAIN) {
982 			mlx5_ib_dbg(dev, "cache empty for order %d", order);
983 			mr = NULL;
984 		}
985 	} else if (access_flags & IB_ACCESS_ON_DEMAND) {
986 		err = -EINVAL;
987 		pr_err("Got MR registration for ODP MR > 512MB, not supported for Connect-IB");
988 		goto error;
989 	}
990 
991 	if (!mr) {
992 		mutex_lock(&dev->slow_path_mutex);
993 		mr = reg_create(NULL, pd, virt_addr, length, umem, ncont,
994 				page_shift, access_flags);
995 		mutex_unlock(&dev->slow_path_mutex);
996 	}
997 
998 	if (IS_ERR(mr)) {
999 		err = PTR_ERR(mr);
1000 		goto error;
1001 	}
1002 
1003 	mlx5_ib_dbg(dev, "mkey 0x%x\n", mr->mmkey.key);
1004 
1005 	mr->umem = umem;
1006 	set_mr_fileds(dev, mr, npages, length, access_flags);
1007 
1008 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1009 	update_odp_mr(mr);
1010 #endif
1011 
1012 	return &mr->ibmr;
1013 
1014 error:
1015 	ib_umem_release(umem);
1016 	return ERR_PTR(err);
1017 }
1018 
1019 static int unreg_umr(struct mlx5_ib_dev *dev, struct mlx5_ib_mr *mr)
1020 {
1021 	struct mlx5_core_dev *mdev = dev->mdev;
1022 	struct umr_common *umrc = &dev->umrc;
1023 	struct mlx5_ib_umr_context umr_context;
1024 	struct mlx5_umr_wr umrwr = {};
1025 	const struct ib_send_wr *bad;
1026 	int err;
1027 
1028 	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR)
1029 		return 0;
1030 
1031 	mlx5_ib_init_umr_context(&umr_context);
1032 
1033 	umrwr.wr.wr_cqe = &umr_context.cqe;
1034 	prep_umr_unreg_wqe(dev, &umrwr, mr->mmkey.key);
1035 
1036 	down(&umrc->sem);
1037 	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
1038 	if (err) {
1039 		up(&umrc->sem);
1040 		mlx5_ib_dbg(dev, "err %d\n", err);
1041 		goto error;
1042 	} else {
1043 		wait_for_completion(&umr_context.done);
1044 		up(&umrc->sem);
1045 	}
1046 	if (umr_context.status != IB_WC_SUCCESS) {
1047 		mlx5_ib_warn(dev, "unreg umr failed\n");
1048 		err = -EFAULT;
1049 		goto error;
1050 	}
1051 	return 0;
1052 
1053 error:
1054 	return err;
1055 }
1056 
1057 static int rereg_umr(struct ib_pd *pd, struct mlx5_ib_mr *mr, u64 virt_addr,
1058 		     u64 length, int npages, int page_shift, int order,
1059 		     int access_flags, int flags)
1060 {
1061 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1062 	struct device *ddev = dev->ib_dev.dma_device;
1063 	struct mlx5_ib_umr_context umr_context;
1064 	const struct ib_send_wr *bad;
1065 	struct mlx5_umr_wr umrwr = {};
1066 	struct ib_sge sg;
1067 	struct umr_common *umrc = &dev->umrc;
1068 	dma_addr_t dma = 0;
1069 	__be64 *mr_pas = NULL;
1070 	int size;
1071 	int err;
1072 
1073 	mlx5_ib_init_umr_context(&umr_context);
1074 
1075 	umrwr.wr.wr_cqe = &umr_context.cqe;
1076 	umrwr.wr.send_flags = MLX5_IB_SEND_UMR_FAIL_IF_FREE;
1077 
1078 	if (flags & IB_MR_REREG_TRANS) {
1079 		err = dma_map_mr_pas(dev, mr->umem, npages, page_shift, &size,
1080 				     &mr_pas, &dma);
1081 		if (err)
1082 			return err;
1083 
1084 		umrwr.target.virt_addr = virt_addr;
1085 		umrwr.length = length;
1086 		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_TRANSLATION;
1087 	}
1088 
1089 	prep_umr_wqe_common(pd, &umrwr, &sg, dma, npages, mr->mmkey.key,
1090 			    page_shift);
1091 
1092 	if (flags & IB_MR_REREG_PD) {
1093 		umrwr.pd = pd;
1094 		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_PD;
1095 	}
1096 
1097 	if (flags & IB_MR_REREG_ACCESS) {
1098 		umrwr.access_flags = access_flags;
1099 		umrwr.wr.send_flags |= MLX5_IB_SEND_UMR_UPDATE_ACCESS;
1100 	}
1101 
1102 	/* post send request to UMR QP */
1103 	down(&umrc->sem);
1104 	err = ib_post_send(umrc->qp, &umrwr.wr, &bad);
1105 
1106 	if (err) {
1107 		mlx5_ib_warn(dev, "post send failed, err %d\n", err);
1108 	} else {
1109 		wait_for_completion(&umr_context.done);
1110 		if (umr_context.status != IB_WC_SUCCESS) {
1111 			mlx5_ib_warn(dev, "reg umr failed (%u)\n",
1112 				     umr_context.status);
1113 			err = -EFAULT;
1114 		}
1115 	}
1116 
1117 	up(&umrc->sem);
1118 	if (flags & IB_MR_REREG_TRANS) {
1119 		dma_unmap_single(ddev, dma, size, DMA_TO_DEVICE);
1120 		kfree(mr_pas);
1121 	}
1122 	return err;
1123 }
1124 
1125 int mlx5_ib_rereg_user_mr(struct ib_mr *ib_mr, int flags, u64 start,
1126 			  u64 length, u64 virt_addr, int new_access_flags,
1127 			  struct ib_pd *new_pd, struct ib_udata *udata)
1128 {
1129 	struct mlx5_ib_dev *dev = to_mdev(ib_mr->device);
1130 	struct mlx5_ib_mr *mr = to_mmr(ib_mr);
1131 	struct ib_pd *pd = (flags & IB_MR_REREG_PD) ? new_pd : ib_mr->pd;
1132 	int access_flags = flags & IB_MR_REREG_ACCESS ?
1133 			    new_access_flags :
1134 			    mr->access_flags;
1135 	u64 addr = (flags & IB_MR_REREG_TRANS) ? virt_addr : mr->umem->address;
1136 	u64 len = (flags & IB_MR_REREG_TRANS) ? length : mr->umem->length;
1137 	int page_shift = 0;
1138 	int npages = 0;
1139 	int ncont = 0;
1140 	int order = 0;
1141 	int err;
1142 
1143 	mlx5_ib_dbg(dev, "start 0x%llx, virt_addr 0x%llx, length 0x%llx, access_flags 0x%x\n",
1144 		    (long long)start, (long long)virt_addr, (long long)length, access_flags);
1145 
1146 	if (flags != IB_MR_REREG_PD) {
1147 		/*
1148 		 * Replace umem. This needs to be done whether or not UMR is
1149 		 * used.
1150 		 */
1151 		flags |= IB_MR_REREG_TRANS;
1152 		ib_umem_release(mr->umem);
1153 		mr->umem = mr_umem_get(pd, addr, len, access_flags, &npages,
1154 				       &page_shift, &ncont, &order);
1155 		if (IS_ERR(mr->umem)) {
1156 			err = PTR_ERR(mr->umem);
1157 			mr->umem = NULL;
1158 			return err;
1159 		}
1160 	}
1161 
1162 	if (flags & IB_MR_REREG_TRANS && !use_umr_mtt_update(mr, addr, len)) {
1163 		/*
1164 		 * UMR can't be used - MKey needs to be replaced.
1165 		 */
1166 		if (mr->umred) {
1167 			err = unreg_umr(dev, mr);
1168 			if (err)
1169 				mlx5_ib_warn(dev, "Failed to unregister MR\n");
1170 		} else {
1171 			err = destroy_mkey(dev, mr);
1172 			if (err)
1173 				mlx5_ib_warn(dev, "Failed to destroy MKey\n");
1174 		}
1175 		if (err)
1176 			return err;
1177 
1178 		mr = reg_create(ib_mr, pd, addr, len, mr->umem, ncont,
1179 				page_shift, access_flags);
1180 
1181 		if (IS_ERR(mr))
1182 			return PTR_ERR(mr);
1183 
1184 		mr->umred = 0;
1185 	} else {
1186 		/*
1187 		 * Send a UMR WQE
1188 		 */
1189 		err = rereg_umr(pd, mr, addr, len, npages, page_shift,
1190 				order, access_flags, flags);
1191 		if (err) {
1192 			mlx5_ib_warn(dev, "Failed to rereg UMR\n");
1193 			return err;
1194 		}
1195 	}
1196 
1197 	if (flags & IB_MR_REREG_PD) {
1198 		ib_mr->pd = pd;
1199 		mr->mmkey.pd = to_mpd(pd)->pdn;
1200 	}
1201 
1202 	if (flags & IB_MR_REREG_ACCESS)
1203 		mr->access_flags = access_flags;
1204 
1205 	if (flags & IB_MR_REREG_TRANS) {
1206 		atomic_sub(mr->npages, &dev->mdev->priv.reg_pages);
1207 		set_mr_fileds(dev, mr, npages, len, access_flags);
1208 		mr->mmkey.iova = addr;
1209 		mr->mmkey.size = len;
1210 	}
1211 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1212 	update_odp_mr(mr);
1213 #endif
1214 
1215 	return 0;
1216 }
1217 
1218 static int
1219 mlx5_alloc_priv_descs(struct ib_device *device,
1220 		      struct mlx5_ib_mr *mr,
1221 		      int ndescs,
1222 		      int desc_size)
1223 {
1224 	int size = ndescs * desc_size;
1225 	int add_size;
1226 	int ret;
1227 
1228 	add_size = max_t(int, MLX5_UMR_ALIGN - 1, 0);
1229 
1230 	mr->descs_alloc = kzalloc(size + add_size, GFP_KERNEL);
1231 	if (!mr->descs_alloc)
1232 		return -ENOMEM;
1233 
1234 	mr->descs = PTR_ALIGN(mr->descs_alloc, MLX5_UMR_ALIGN);
1235 
1236 	mr->desc_map = dma_map_single(device->dma_device, mr->descs,
1237 				      size, DMA_TO_DEVICE);
1238 	if (dma_mapping_error(device->dma_device, mr->desc_map)) {
1239 		ret = -ENOMEM;
1240 		goto err;
1241 	}
1242 
1243 	return 0;
1244 err:
1245 	kfree(mr->descs_alloc);
1246 
1247 	return ret;
1248 }
1249 
1250 static void
1251 mlx5_free_priv_descs(struct mlx5_ib_mr *mr)
1252 {
1253 	if (mr->descs) {
1254 		struct ib_device *device = mr->ibmr.device;
1255 		int size = mr->max_descs * mr->desc_size;
1256 
1257 		dma_unmap_single(device->dma_device, mr->desc_map,
1258 				 size, DMA_TO_DEVICE);
1259 		kfree(mr->descs_alloc);
1260 		mr->descs = NULL;
1261 	}
1262 }
1263 
1264 static int clean_mr(struct mlx5_ib_mr *mr)
1265 {
1266 	struct mlx5_ib_dev *dev = to_mdev(mr->ibmr.device);
1267 	int umred = mr->umred;
1268 	int err;
1269 
1270 	if (mr->sig) {
1271 		if (mlx5_core_destroy_psv(dev->mdev,
1272 					  mr->sig->psv_memory.psv_idx))
1273 			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1274 				     mr->sig->psv_memory.psv_idx);
1275 		if (mlx5_core_destroy_psv(dev->mdev,
1276 					  mr->sig->psv_wire.psv_idx))
1277 			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1278 				     mr->sig->psv_wire.psv_idx);
1279 		kfree(mr->sig);
1280 		mr->sig = NULL;
1281 	}
1282 
1283 	mlx5_free_priv_descs(mr);
1284 
1285 	if (!umred) {
1286 		u32 key = mr->mmkey.key;
1287 
1288 		err = destroy_mkey(dev, mr);
1289 		kfree(mr);
1290 		if (err) {
1291 			mlx5_ib_warn(dev, "failed to destroy mkey 0x%x (%d)\n",
1292 				     key, err);
1293 			return err;
1294 		}
1295 	} else {
1296 		err = unreg_umr(dev, mr);
1297 		if (err) {
1298 			mlx5_ib_warn(dev, "failed unregister\n");
1299 			return err;
1300 		}
1301 		free_cached_mr(dev, mr);
1302 	}
1303 
1304 	return 0;
1305 }
1306 
1307 int mlx5_ib_dereg_mr(struct ib_mr *ibmr, struct ib_udata *udata)
1308 {
1309 	struct mlx5_ib_dev *dev = to_mdev(ibmr->device);
1310 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1311 	int npages = mr->npages;
1312 	struct ib_umem *umem = mr->umem;
1313 
1314 #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING
1315 	if (umem && umem->odp_data) {
1316 		/* Prevent new page faults from succeeding */
1317 		mr->live = 0;
1318 		/* Wait for all running page-fault handlers to finish. */
1319 		synchronize_srcu(&dev->mr_srcu);
1320 		/* Destroy all page mappings */
1321 		mlx5_ib_invalidate_range(umem, ib_umem_start(umem),
1322 					 ib_umem_end(umem));
1323 		/*
1324 		 * We kill the umem before the MR for ODP,
1325 		 * so that there will not be any invalidations in
1326 		 * flight, looking at the *mr struct.
1327 		 */
1328 		ib_umem_release(umem);
1329 		atomic_sub(npages, &dev->mdev->priv.reg_pages);
1330 
1331 		/* Avoid double-freeing the umem. */
1332 		umem = NULL;
1333 	}
1334 #endif
1335 
1336 	clean_mr(mr);
1337 
1338 	if (umem) {
1339 		ib_umem_release(umem);
1340 		atomic_sub(npages, &dev->mdev->priv.reg_pages);
1341 	}
1342 
1343 	return 0;
1344 }
1345 
1346 struct ib_mr *mlx5_ib_alloc_mr(struct ib_pd *pd,
1347 			       enum ib_mr_type mr_type,
1348 			       u32 max_num_sg, struct ib_udata *udata)
1349 {
1350 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1351 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1352 	int ndescs = ALIGN(max_num_sg, 4);
1353 	struct mlx5_ib_mr *mr;
1354 	void *mkc;
1355 	u32 *in;
1356 	int err;
1357 
1358 	mr = kzalloc(sizeof(*mr), GFP_KERNEL);
1359 	if (!mr)
1360 		return ERR_PTR(-ENOMEM);
1361 
1362 	in = kzalloc(inlen, GFP_KERNEL);
1363 	if (!in) {
1364 		err = -ENOMEM;
1365 		goto err_free;
1366 	}
1367 
1368 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1369 	MLX5_SET(mkc, mkc, free, 1);
1370 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1371 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
1372 	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1373 
1374 	if (mr_type == IB_MR_TYPE_MEM_REG) {
1375 		mr->access_mode = MLX5_ACCESS_MODE_MTT;
1376 		MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
1377 		err = mlx5_alloc_priv_descs(pd->device, mr,
1378 					    ndescs, sizeof(u64));
1379 		if (err)
1380 			goto err_free_in;
1381 
1382 		mr->desc_size = sizeof(u64);
1383 		mr->max_descs = ndescs;
1384 	} else if (mr_type == IB_MR_TYPE_SG_GAPS) {
1385 		mr->access_mode = MLX5_ACCESS_MODE_KLM;
1386 
1387 		err = mlx5_alloc_priv_descs(pd->device, mr,
1388 					    ndescs, sizeof(struct mlx5_klm));
1389 		if (err)
1390 			goto err_free_in;
1391 		mr->desc_size = sizeof(struct mlx5_klm);
1392 		mr->max_descs = ndescs;
1393 	} else if (mr_type == IB_MR_TYPE_INTEGRITY) {
1394 		u32 psv_index[2];
1395 
1396 		MLX5_SET(mkc, mkc, bsf_en, 1);
1397 		MLX5_SET(mkc, mkc, bsf_octword_size, MLX5_MKEY_BSF_OCTO_SIZE);
1398 		mr->sig = kzalloc(sizeof(*mr->sig), GFP_KERNEL);
1399 		if (!mr->sig) {
1400 			err = -ENOMEM;
1401 			goto err_free_in;
1402 		}
1403 
1404 		/* create mem & wire PSVs */
1405 		err = mlx5_core_create_psv(dev->mdev, to_mpd(pd)->pdn,
1406 					   2, psv_index);
1407 		if (err)
1408 			goto err_free_sig;
1409 
1410 		mr->access_mode = MLX5_ACCESS_MODE_KLM;
1411 		mr->sig->psv_memory.psv_idx = psv_index[0];
1412 		mr->sig->psv_wire.psv_idx = psv_index[1];
1413 
1414 		mr->sig->sig_status_checked = true;
1415 		mr->sig->sig_err_exists = false;
1416 		/* Next UMR, Arm SIGERR */
1417 		++mr->sig->sigerr_count;
1418 	} else {
1419 		mlx5_ib_warn(dev, "Invalid mr type %d\n", mr_type);
1420 		err = -EINVAL;
1421 		goto err_free_in;
1422 	}
1423 
1424 	MLX5_SET(mkc, mkc, access_mode, mr->access_mode);
1425 	MLX5_SET(mkc, mkc, umr_en, 1);
1426 
1427 	err = mlx5_core_create_mkey(dev->mdev, &mr->mmkey, in, inlen);
1428 	if (err)
1429 		goto err_destroy_psv;
1430 
1431 	mr->ibmr.lkey = mr->mmkey.key;
1432 	mr->ibmr.rkey = mr->mmkey.key;
1433 	mr->umem = NULL;
1434 	kfree(in);
1435 
1436 	return &mr->ibmr;
1437 
1438 err_destroy_psv:
1439 	if (mr->sig) {
1440 		if (mlx5_core_destroy_psv(dev->mdev,
1441 					  mr->sig->psv_memory.psv_idx))
1442 			mlx5_ib_warn(dev, "failed to destroy mem psv %d\n",
1443 				     mr->sig->psv_memory.psv_idx);
1444 		if (mlx5_core_destroy_psv(dev->mdev,
1445 					  mr->sig->psv_wire.psv_idx))
1446 			mlx5_ib_warn(dev, "failed to destroy wire psv %d\n",
1447 				     mr->sig->psv_wire.psv_idx);
1448 	}
1449 	mlx5_free_priv_descs(mr);
1450 err_free_sig:
1451 	kfree(mr->sig);
1452 err_free_in:
1453 	kfree(in);
1454 err_free:
1455 	kfree(mr);
1456 	return ERR_PTR(err);
1457 }
1458 
1459 struct ib_mw *mlx5_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type,
1460 			       struct ib_udata *udata)
1461 {
1462 	struct mlx5_ib_dev *dev = to_mdev(pd->device);
1463 	int inlen = MLX5_ST_SZ_BYTES(create_mkey_in);
1464 	struct mlx5_ib_mw *mw = NULL;
1465 	u32 *in = NULL;
1466 	void *mkc;
1467 	int ndescs;
1468 	int err;
1469 	struct mlx5_ib_alloc_mw req = {};
1470 	struct {
1471 		__u32	comp_mask;
1472 		__u32	response_length;
1473 	} resp = {};
1474 
1475 	err = ib_copy_from_udata(&req, udata, min(udata->inlen, sizeof(req)));
1476 	if (err)
1477 		return ERR_PTR(err);
1478 
1479 	if (req.comp_mask || req.reserved1 || req.reserved2)
1480 		return ERR_PTR(-EOPNOTSUPP);
1481 
1482 	if (udata->inlen > sizeof(req) &&
1483 	    !ib_is_udata_cleared(udata, sizeof(req),
1484 				 udata->inlen - sizeof(req)))
1485 		return ERR_PTR(-EOPNOTSUPP);
1486 
1487 	ndescs = req.num_klms ? roundup(req.num_klms, 4) : roundup(1, 4);
1488 
1489 	mw = kzalloc(sizeof(*mw), GFP_KERNEL);
1490 	in = kzalloc(inlen, GFP_KERNEL);
1491 	if (!mw || !in) {
1492 		err = -ENOMEM;
1493 		goto free;
1494 	}
1495 
1496 	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
1497 
1498 	MLX5_SET(mkc, mkc, free, 1);
1499 	MLX5_SET(mkc, mkc, translations_octword_size, ndescs);
1500 	MLX5_SET(mkc, mkc, pd, to_mpd(pd)->pdn);
1501 	MLX5_SET(mkc, mkc, umr_en, 1);
1502 	MLX5_SET(mkc, mkc, lr, 1);
1503 	MLX5_SET(mkc, mkc, access_mode, MLX5_ACCESS_MODE_KLM);
1504 	MLX5_SET(mkc, mkc, en_rinval, !!((type == IB_MW_TYPE_2)));
1505 	MLX5_SET(mkc, mkc, qpn, 0xffffff);
1506 
1507 	err = mlx5_core_create_mkey(dev->mdev, &mw->mmkey, in, inlen);
1508 	if (err)
1509 		goto free;
1510 
1511 	mw->ibmw.rkey = mw->mmkey.key;
1512 
1513 	resp.response_length = min(offsetof(typeof(resp), response_length) +
1514 				   sizeof(resp.response_length), udata->outlen);
1515 	if (resp.response_length) {
1516 		err = ib_copy_to_udata(udata, &resp, resp.response_length);
1517 		if (err) {
1518 			mlx5_core_destroy_mkey(dev->mdev, &mw->mmkey);
1519 			goto free;
1520 		}
1521 	}
1522 
1523 	kfree(in);
1524 	return &mw->ibmw;
1525 
1526 free:
1527 	kfree(mw);
1528 	kfree(in);
1529 	return ERR_PTR(err);
1530 }
1531 
1532 int mlx5_ib_dealloc_mw(struct ib_mw *mw)
1533 {
1534 	struct mlx5_ib_mw *mmw = to_mmw(mw);
1535 	int err;
1536 
1537 	err =  mlx5_core_destroy_mkey((to_mdev(mw->device))->mdev,
1538 				      &mmw->mmkey);
1539 	if (!err)
1540 		kfree(mmw);
1541 	return err;
1542 }
1543 
1544 int mlx5_ib_check_mr_status(struct ib_mr *ibmr, u32 check_mask,
1545 			    struct ib_mr_status *mr_status)
1546 {
1547 	struct mlx5_ib_mr *mmr = to_mmr(ibmr);
1548 	int ret = 0;
1549 
1550 	if (check_mask & ~IB_MR_CHECK_SIG_STATUS) {
1551 		pr_err("Invalid status check mask\n");
1552 		ret = -EINVAL;
1553 		goto done;
1554 	}
1555 
1556 	mr_status->fail_status = 0;
1557 	if (check_mask & IB_MR_CHECK_SIG_STATUS) {
1558 		if (!mmr->sig) {
1559 			ret = -EINVAL;
1560 			pr_err("signature status check requested on a non-signature enabled MR\n");
1561 			goto done;
1562 		}
1563 
1564 		mmr->sig->sig_status_checked = true;
1565 		if (!mmr->sig->sig_err_exists)
1566 			goto done;
1567 
1568 		if (ibmr->lkey == mmr->sig->err_item.key)
1569 			memcpy(&mr_status->sig_err, &mmr->sig->err_item,
1570 			       sizeof(mr_status->sig_err));
1571 		else {
1572 			mr_status->sig_err.err_type = IB_SIG_BAD_GUARD;
1573 			mr_status->sig_err.sig_err_offset = 0;
1574 			mr_status->sig_err.key = mmr->sig->err_item.key;
1575 		}
1576 
1577 		mmr->sig->sig_err_exists = false;
1578 		mr_status->fail_status |= IB_MR_CHECK_SIG_STATUS;
1579 	}
1580 
1581 done:
1582 	return ret;
1583 }
1584 
1585 static int
1586 mlx5_ib_sg_to_klms(struct mlx5_ib_mr *mr,
1587 		   struct scatterlist *sgl,
1588 		   unsigned short sg_nents,
1589 		   unsigned int *sg_offset_p)
1590 {
1591 	struct scatterlist *sg = sgl;
1592 	struct mlx5_klm *klms = mr->descs;
1593 	unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
1594 	u32 lkey = mr->ibmr.pd->local_dma_lkey;
1595 	int i;
1596 
1597 	mr->ibmr.iova = sg_dma_address(sg) + sg_offset;
1598 	mr->ibmr.length = 0;
1599 	mr->ndescs = sg_nents;
1600 
1601 	for_each_sg(sgl, sg, sg_nents, i) {
1602 		if (unlikely(i > mr->max_descs))
1603 			break;
1604 		klms[i].va = cpu_to_be64(sg_dma_address(sg) + sg_offset);
1605 		klms[i].bcount = cpu_to_be32(sg_dma_len(sg) - sg_offset);
1606 		klms[i].key = cpu_to_be32(lkey);
1607 		mr->ibmr.length += sg_dma_len(sg);
1608 
1609 		sg_offset = 0;
1610 	}
1611 
1612 	if (sg_offset_p)
1613 		*sg_offset_p = sg_offset;
1614 
1615 	return i;
1616 }
1617 
1618 static int mlx5_set_page(struct ib_mr *ibmr, u64 addr)
1619 {
1620 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1621 	__be64 *descs;
1622 
1623 	if (unlikely(mr->ndescs == mr->max_descs))
1624 		return -ENOMEM;
1625 
1626 	descs = mr->descs;
1627 	descs[mr->ndescs++] = cpu_to_be64(addr | MLX5_EN_RD | MLX5_EN_WR);
1628 
1629 	return 0;
1630 }
1631 
1632 int mlx5_ib_map_mr_sg(struct ib_mr *ibmr, struct scatterlist *sg, int sg_nents,
1633 		      unsigned int *sg_offset)
1634 {
1635 	struct mlx5_ib_mr *mr = to_mmr(ibmr);
1636 	int n;
1637 
1638 	mr->ndescs = 0;
1639 
1640 	ib_dma_sync_single_for_cpu(ibmr->device, mr->desc_map,
1641 				   mr->desc_size * mr->max_descs,
1642 				   DMA_TO_DEVICE);
1643 
1644 	if (mr->access_mode == MLX5_ACCESS_MODE_KLM)
1645 		n = mlx5_ib_sg_to_klms(mr, sg, sg_nents, sg_offset);
1646 	else
1647 		n = ib_sg_to_pages(ibmr, sg, sg_nents, sg_offset,
1648 				mlx5_set_page);
1649 
1650 	ib_dma_sync_single_for_device(ibmr->device, mr->desc_map,
1651 				      mr->desc_size * mr->max_descs,
1652 				      DMA_TO_DEVICE);
1653 
1654 	return n;
1655 }
1656